Search | arXiv e-print repository
Showing 1–50 of 703 results for author: Yang, T
Searching in archive cs. 50 results per page, sorted by announcement date (newest first).
1. arXiv:2411.11221 [eess.SY, cs.LG]
   Data Driven Automatic Electrical Machine Preliminary Design with Artificial Intelligence Expert Guidance
   Authors: Yiwei Wang, Tao Yang, Hailin Huang, Tianjie Zou, Jincai Li, Nuo Chen, Zhuoran Zhang
   Abstract: This paper presents a data-driven electrical machine design (EMD) framework, using a wound-rotor synchronous generator (WRSG) as a design example. Unlike traditional preliminary EMD processes that rely heavily on expertise, this framework leverages an artificial-intelligence-based expert database to provide preliminary designs directly from user specifications. Initial data are generated using 2D finite element (FE) machine models by sweeping fundamental design variables, including machine length and diameter, so that the geometry can be scaled and the performance of each design is recorded. These data train a Metamodel of Optimal Prognosis (MOP)-based surrogate model, which maps design variables to key performance indicators (KPIs). Once trained, and guided by metaheuristic algorithms, the surrogate model can generate thousands of geometrically scalable designs covering a wide power range, forming an AI expert database to guide future preliminary design. The framework is validated with a 30 kVA WRSG design case. A prebuilt WRSG database covering 10 to 60 kVA is validated by FE simulation. Design No. 1138 is selected from the database and compared with a conventional design: it achieves a higher power density of 2.21 kVA/kg in just 5 seconds, compared with 2.02 kVA/kg obtained using the traditional method, which takes several days. The developed AI expert database also serves as a high-quality data source for further developing AI models for automatic electrical machine design.
   Submitted 17 November, 2024; originally announced November 2024.
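   As a rough sketch of the surrogate-based workflow this abstract describes (sweep design variables, record FE-simulated KPIs, fit a surrogate, then query it for candidate designs), the snippet below uses a toy KPI function and a generic Gaussian-process regressor. The function names, variable ranges, and KPI are illustrative assumptions, not the paper's MOP model or data.

```python
# Illustrative surrogate-based preliminary design loop (not the paper's MOP model).
# Assumption: a toy KPI function stands in for the 2D finite-element solver, and the
# only design variables are machine length and diameter.
import numpy as np
from sklearn.gaussian_process import GaussianProcessRegressor

rng = np.random.default_rng(0)

def fe_simulation(length_m, diameter_m):
    """Stand-in for a 2D FE run: returns a fake power-density KPI (kVA/kg)."""
    return 2.0 + 0.5 * np.sin(5 * length_m) * np.cos(4 * diameter_m) - (length_m - 0.4) ** 2

# 1) Sweep the design variables and record KPIs (the seed data for the expert database).
lengths = rng.uniform(0.2, 0.8, 200)
diameters = rng.uniform(0.2, 0.6, 200)
X = np.column_stack([lengths, diameters])
y = fe_simulation(lengths, diameters)

# 2) Train a surrogate that maps design variables to the KPI.
surrogate = GaussianProcessRegressor(normalize_y=True).fit(X, y)

# 3) Query thousands of candidate geometries and keep the best predicted design.
candidates = np.column_stack([rng.uniform(0.2, 0.8, 5000), rng.uniform(0.2, 0.6, 5000)])
pred = surrogate.predict(candidates)
best = candidates[np.argmax(pred)]
print(f"best candidate: length={best[0]:.3f} m, diameter={best[1]:.3f} m, "
      f"predicted KPI={pred.max():.2f} kVA/kg")
```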
2. arXiv:2411.09921 [cs.CV, cs.AI]
   Motion-Grounded Video Reasoning: Understanding and Perceiving Motion at Pixel Level
   Authors: Andong Deng, Tongjia Chen, Shoubin Yu, Taojiannan Yang, Lincoln Spencer, Yapeng Tian, Ajmal Saeed Mian, Mohit Bansal, Chen Chen
   Abstract: In this paper, we introduce Motion-Grounded Video Reasoning, a new motion understanding task that requires generating visual answers (video segmentation masks) according to an input question, and hence needs implicit spatiotemporal reasoning and grounding. This task extends existing spatiotemporal grounding work, which focuses on explicit action/motion grounding, to a more general format by enabling implicit reasoning via questions. To facilitate development of the new task, we collect a large-scale dataset called GROUNDMORE, which comprises 1,715 video clips and 249K object masks deliberately designed with four question types (Causal, Sequential, Counterfactual, and Descriptive) for benchmarking deep and comprehensive motion reasoning abilities. GROUNDMORE uniquely requires models to generate visual answers, providing a more concrete and visually interpretable response than plain text. It evaluates models on both spatiotemporal grounding and reasoning, encouraging progress on complex challenges in motion-related video reasoning, temporal perception, and pixel-level understanding. Furthermore, we introduce a novel baseline model named Motion-Grounded Video Reasoning Assistant (MORA). MORA incorporates the multimodal reasoning ability of a multimodal LLM, the pixel-level perception capability of a grounding model (SAM), and the temporal perception ability of a lightweight localization head. MORA achieves respectable performance on GROUNDMORE, outperforming the best existing visual grounding baseline by an average of 21.5% relative. We hope this novel and challenging task will pave the way for future advancements in robust and general motion understanding via video reasoning segmentation.
   Submitted 14 November, 2024; originally announced November 2024.

3. arXiv:2411.08227 [cs.CV, cs.AI]
   DPU: Dynamic Prototype Updating for Multimodal Out-of-Distribution Detection
   Authors: Shawn Li, Huixian Gong, Hao Dong, Tiankai Yang, Zhengzhong Tu, Yue Zhao
   Abstract: Out-of-distribution (OOD) detection is essential for ensuring the robustness of machine learning models by identifying samples that deviate from the training distribution. While traditional OOD detection has primarily focused on single-modality inputs, such as images, recent advances in multimodal models have demonstrated the potential of leveraging multiple modalities (e.g., video, optical flow, audio) to enhance detection performance. However, existing methods often overlook intra-class variability within in-distribution (ID) data, assuming that samples of the same class are perfectly cohesive and consistent. This assumption can lead to performance degradation, especially when prediction discrepancies are uniformly amplified across all samples. To address this issue, we propose Dynamic Prototype Updating (DPU), a novel plug-and-play framework for multimodal OOD detection that accounts for intra-class variations. Our method dynamically updates class center representations for each class by measuring the variance of similar samples within each batch, enabling adaptive adjustments. This approach allows us to amplify prediction discrepancies based on the updated class centers, thereby improving the model's robustness and generalization across different modalities. Extensive experiments on two tasks, five datasets, and nine base OOD algorithms demonstrate that DPU significantly improves OOD detection performance, setting a new state-of-the-art in multimodal OOD detection, with improvements of up to 80 percent in Far-OOD detection. To facilitate accessibility and reproducibility, our code is publicly available on GitHub.
   Submitted 12 November, 2024; originally announced November 2024.
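   A minimal sketch of the variance-aware prototype update idea described above, assuming a simple momentum rule and nearest-center distance scoring; the update formula and names here are illustrative, not the authors' exact DPU algorithm.

```python
# Minimal sketch of variance-aware prototype updating for OOD detection.
# Illustrates the general idea (adapt each class center using the spread of
# same-class features within a batch), not the exact DPU update rule.
import torch

def update_prototypes(prototypes, feats, labels, base_momentum=0.9):
    """prototypes: (C, D) tensor; feats: (B, D); labels: (B,) int64."""
    for c in labels.unique():
        class_feats = feats[labels == c]
        batch_mean = class_feats.mean(dim=0)
        # Higher intra-class variance -> larger momentum -> smaller step toward the batch mean.
        var = class_feats.var(dim=0, unbiased=False).mean()
        momentum = base_momentum + (1 - base_momentum) * torch.sigmoid(var)
        prototypes[c] = momentum * prototypes[c] + (1 - momentum) * batch_mean
    return prototypes

def ood_score(prototypes, feat):
    """Score a test feature by its distance to the nearest (updated) class center."""
    return torch.cdist(feat.unsqueeze(0), prototypes).min().item()

# Toy usage with random features standing in for a multimodal encoder's output.
protos = torch.zeros(10, 128)
feats, labels = torch.randn(32, 128), torch.randint(0, 10, (32,))
protos = update_prototypes(protos, feats, labels)
print("OOD score of a random sample:", ood_score(protos, torch.randn(128)))
```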
4. arXiv:2411.04933 [cs.CV]
   SaSR-Net: Source-Aware Semantic Representation Network for Enhancing Audio-Visual Question Answering
   Authors: Tianyu Yang, Yiyang Nan, Lisen Dai, Zhenwen Liang, Yapeng Tian, Xiangliang Zhang
   Abstract: Audio-Visual Question Answering (AVQA) is a challenging task that involves answering questions based on both auditory and visual information in videos. A significant challenge is interpreting complex multi-modal scenes, which include both visual objects and sound sources, and connecting them to the given question. In this paper, we introduce the Source-aware Semantic Representation Network (SaSR-Net), a novel model designed for AVQA. SaSR-Net utilizes source-wise learnable tokens to efficiently capture and align audio-visual elements with the corresponding question. It streamlines the fusion of audio and visual information using spatial and temporal attention mechanisms to identify answers in multi-modal scenes. Extensive experiments on the Music-AVQA and AVQA-Yang datasets show that SaSR-Net outperforms state-of-the-art AVQA methods.
   Submitted 10 November, 2024; v1 submitted 7 November, 2024; originally announced November 2024.
   Comments: EMNLP 2024
5. arXiv:2411.02265 [cs.CL, cs.AI]
   Hunyuan-Large: An Open-Source MoE Model with 52 Billion Activated Parameters by Tencent
   Authors: Xingwu Sun, Yanfeng Chen, Yiqing Huang, Ruobing Xie, Jiaqi Zhu, Kai Zhang, Shuaipeng Li, Zhen Yang, Jonny Han, Xiaobo Shu, Jiahao Bu, Zhongzhi Chen, Xuemeng Huang, Fengzong Lian, Saiyong Yang, Jianfeng Yan, Yuyuan Zeng, Xiaoqin Ren, Chao Yu, Lulu Wu, Yue Mao, Jun Xia, Tao Yang, Suncong Zheng, Kan Wu, et al. (83 additional authors not shown)
   Abstract: In this paper, we introduce Hunyuan-Large, which is currently the largest open-source Transformer-based mixture-of-experts model, with a total of 389 billion parameters and 52 billion activated parameters, capable of handling up to 256K tokens. We conduct a thorough evaluation of Hunyuan-Large's superior performance across various benchmarks including language understanding and generation, logical reasoning, mathematical problem-solving, coding, long-context, and aggregated tasks, where it outperforms LLama3.1-70B and exhibits comparable performance to the significantly larger LLama3.1-405B model. Key practices of Hunyuan-Large include large-scale synthetic data that is orders of magnitude larger than in previous literature, a mixed expert routing strategy, a key-value cache compression technique, and an expert-specific learning rate strategy. Additionally, we investigate the scaling laws and learning rate schedule of mixture-of-experts models, providing valuable insights and guidance for future model development and optimization. The code and checkpoints of Hunyuan-Large are released to facilitate future innovations and applications. Codes: https://github.com/Tencent/Hunyuan-Large Models: https://huggingface.co/tencent/Tencent-Hunyuan-Large
   Submitted 6 November, 2024; v1 submitted 4 November, 2024; originally announced November 2024.
   Comments: 17 pages, 4 Figures

6. arXiv:2411.00408 [cs.NI, cs.AR, cs.LG]
   Inference-to-complete: A High-performance and Programmable Data-plane Co-processor for Neural-network-driven Traffic Analysis
   Authors: Dong Wen, Zhongpei Liu, Tong Yang, Tao Li, Tianyun Li, Chenglong Li, Jie Li, Zhigang Sun
   Abstract: The neural-network-driven intelligent data plane (NN-driven IDP) is an emerging topic thanks to its excellent accuracy and high performance. We argue that an NN-driven IDP should satisfy three design goals: the flexibility to support various NN models, low-latency, high-throughput inference, and data-plane unawareness that harms neither performance nor functionality. Unfortunately, existing work either over-modifies NNs for the IDP or inserts inline pipelined accelerators into the data plane, failing to meet the flexibility and unawareness goals. In this paper, we propose Kaleidoscope, a flexible and high-performance co-processor located at the bypass of the data plane. To meet the three design goals, three key techniques are presented. Programmable run-to-completion accelerators are developed for flexible inference. To further improve performance, we design a scalable inference engine that completes low-latency, low-cost inference for mouse flows and runs complex, high-accuracy NNs for elephant flows. Finally, raw-bytes-based NNs are introduced, which help to achieve unawareness. We prototype Kaleidoscope on both FPGA and an ASIC library. In evaluation on six NN models, Kaleidoscope reaches 256-352 ns inference latency and 100 Gbps throughput with negligible influence on the data plane. The on-board tested NNs achieve state-of-the-art accuracy among NN-driven IDPs, exhibiting the significant impact of flexibility on enhancing traffic analysis accuracy.
   Submitted 1 November, 2024; originally announced November 2024.
   Comments: Under review
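   The mouse/elephant split described in this abstract, a cheap model for short flows and a heavier NN for long flows, can be illustrated with a simple dispatcher; the packet-count threshold and the two stand-in classifiers below are assumptions for illustration only, not Kaleidoscope's actual engine.

```python
# Illustrative flow dispatcher: cheap model for mouse flows, heavier model for
# elephant flows. Threshold, feature layout, and models are hypothetical.
import numpy as np

ELEPHANT_PKT_THRESHOLD = 32  # packets seen so far; hypothetical cutoff

def tiny_classifier(raw_bytes):
    """Stand-in for a low-latency model over raw packet bytes."""
    return int(raw_bytes.sum() % 2)

def large_classifier(raw_bytes):
    """Stand-in for a higher-accuracy, higher-cost neural network."""
    return int(raw_bytes.astype(np.float32).mean() > 127)

def classify_flow(flow):
    """flow: dict with 'pkt_count' and 'payload' (np.uint8 array of raw bytes)."""
    if flow["pkt_count"] < ELEPHANT_PKT_THRESHOLD:
        return tiny_classifier(flow["payload"])   # mouse flow: fast path
    return large_classifier(flow["payload"])      # elephant flow: accurate path

print(classify_flow({"pkt_count": 5, "payload": np.arange(64, dtype=np.uint8)}))
print(classify_flow({"pkt_count": 120, "payload": np.arange(64, dtype=np.uint8)}))
```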
7. arXiv:2411.00081 [cs.RO, cs.AI]
   PARTNR: A Benchmark for Planning and Reasoning in Embodied Multi-agent Tasks
   Authors: Matthew Chang, Gunjan Chhablani, Alexander Clegg, Mikael Dallaire Cote, Ruta Desai, Michal Hlavac, Vladimir Karashchuk, Jacob Krantz, Roozbeh Mottaghi, Priyam Parashar, Siddharth Patki, Ishita Prasad, Xavier Puig, Akshara Rai, Ram Ramrakhya, Daniel Tran, Joanne Truong, John M. Turner, Eric Undersander, Tsung-Yen Yang
   Abstract: We present a benchmark for Planning And Reasoning Tasks in humaN-Robot collaboration (PARTNR) designed to study human-robot coordination in household activities. PARTNR tasks exhibit characteristics of everyday tasks, such as spatial, temporal, and heterogeneous agent capability constraints. We employ a semi-automated task generation pipeline using Large Language Models (LLMs), incorporating simulation in the loop for grounding and verification. PARTNR stands as the largest benchmark of its kind, comprising 100,000 natural language tasks spanning 60 houses and 5,819 unique objects. We analyze state-of-the-art LLMs on PARTNR tasks across the axes of planning, perception, and skill execution. The analysis reveals significant limitations in SoTA models, such as poor coordination and failures in task tracking and recovery from errors. When LLMs are paired with real humans, they require 1.5x as many steps as two humans collaborating and 1.1x more steps than a single human, underscoring the potential for improvement in these models. We further show that fine-tuning smaller LLMs with planning data can achieve performance on par with models 9 times larger, while being 8.6x faster at inference. Overall, PARTNR highlights significant challenges facing collaborative embodied agents and aims to drive research in this direction.
   Submitted 31 October, 2024; originally announced November 2024.
   Comments: Alphabetical author order

8. arXiv:2410.24001 [cs.CV]
   ImOV3D: Learning Open-Vocabulary Point Clouds 3D Object Detection from Only 2D Images
   Authors: Timing Yang, Yuanliang Ju, Li Yi
   Abstract: Open-vocabulary 3D object detection (OV-3Det) aims to generalize beyond the limited number of base categories labeled during the training phase. The biggest bottleneck is the scarcity of annotated 3D data, whereas 2D image datasets are abundant and richly annotated. Consequently, it is intuitive to leverage the wealth of annotations in 2D images to alleviate the inherent data scarcity in OV-3Det. In this paper, we push the task setup to its limits by exploring the potential of using solely 2D images to learn OV-3Det. The major challenge for this setup is the modality gap between training images and testing point clouds, which prevents effective integration of 2D knowledge into OV-3Det. To address this challenge, we propose a novel framework, ImOV3D, which leverages pseudo multimodal representations containing both images and point clouds (PC) to close the modality gap. The key of ImOV3D lies in flexible modality conversion, where 2D images can be lifted into 3D using monocular depth estimation and can also be derived from 3D scenes through rendering. This allows unifying both training images and testing point clouds into a common image-PC representation, encompassing a wealth of 2D semantic information and also incorporating the depth and structural characteristics of 3D spatial data. We carefully conduct such conversion to minimize the domain gap between training and test cases. Extensive experiments on two benchmark datasets, SUNRGBD and ScanNet, show that ImOV3D significantly outperforms existing methods, even in the absence of ground truth 3D training data. With the inclusion of a minimal amount of real 3D data for fine-tuning, the performance also significantly surpasses the previous state-of-the-art. Code and pre-trained models are released at https://github.com/yangtiming/ImOV3D.
   Submitted 31 October, 2024; originally announced October 2024.
   Comments: Accepted by NeurIPS 2024. Code link https://github.com/yangtiming/ImOV3D
   Journal ref: NeurIPS 2024
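   A small sketch of the "lift a 2D image into 3D" step mentioned in this abstract: back-projecting a depth map through pinhole intrinsics to obtain a pseudo point cloud. The depth map and intrinsics below are synthetic placeholders; ImOV3D's pipeline uses a monocular depth estimator and further correction steps not shown here.

```python
# Back-project a (predicted) depth map through pinhole intrinsics to get a pseudo
# point cloud. Depth values and camera parameters are made up for illustration.
import numpy as np

def depth_to_point_cloud(depth, fx, fy, cx, cy):
    """depth: (H, W) array of metric depths; returns (H*W, 3) XYZ points."""
    h, w = depth.shape
    u, v = np.meshgrid(np.arange(w), np.arange(h))
    z = depth
    x = (u - cx) * z / fx
    y = (v - cy) * z / fy
    return np.stack([x, y, z], axis=-1).reshape(-1, 3)

# Toy example: a flat plane 2 m in front of a 640x480 camera.
depth = np.full((480, 640), 2.0)
points = depth_to_point_cloud(depth, fx=525.0, fy=525.0, cx=320.0, cy=240.0)
print(points.shape)  # (307200, 3)
```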
9. arXiv:2410.23757 [cs.IR]
   Identify Then Recommend: Towards Unsupervised Group Recommendation
   Authors: Yue Liu, Shihao Zhu, Tianyuan Yang, Jian Ma, Wenliang Zhong
   Abstract: Group Recommendation (GR), which aims to recommend items to groups of users, has become a promising and practical direction for recommendation systems. This paper points out two issues of state-of-the-art GR models. (1) The pre-defined and fixed number of user groups is inadequate for real-time industrial recommendation systems, where the group distribution can shift dynamically. (2) The training schema of existing GR methods is supervised, necessitating expensive user-group and group-item labels and leading to significant annotation costs. To this end, we present a novel unsupervised group recommendation framework named Identify Then Recommend (ITR), which first identifies the user groups in an unsupervised manner, even without a pre-defined number of groups, and then designs two pretext tasks to conduct self-supervised group recommendation. Concretely, at the group identification stage, we first estimate the adaptive density of each user point, where areas with higher densities are more likely to be recognized as group centers. Then, a heuristic merge-and-split strategy is designed to discover the user groups and decision boundaries. Subsequently, at the self-supervised learning stage, a pull-and-repulsion pretext task is proposed to optimize the user-group distribution, and a pseudo group recommendation pretext task is designed to assist the recommendations. Extensive experiments demonstrate the superiority and effectiveness of ITR on both user recommendation (e.g., a 22.22% improvement in NDCG@5) and group recommendation (e.g., a 22.95% improvement in NDCG@5). Furthermore, we deploy ITR on an industrial recommender and achieve promising results.
   Submitted 31 October, 2024; originally announced October 2024.
   Comments: 26 pages
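   A generic density-peak heuristic illustrating the "identify groups without a preset group count" stage described above: estimate each user's local density from k-nearest-neighbor distances and keep users that are denser than all of their neighbors as candidate group centers. This is a stand-in heuristic, not ITR's adaptive density estimator or its merge-and-split strategy.

```python
# Density-peak sketch: candidate group centers are users whose local density
# (inverse mean k-NN distance) is at least as high as that of all their neighbors.
import numpy as np

def find_group_centers(user_emb, k=10):
    """user_emb: (N, D) user embeddings; returns indices of candidate group centers."""
    dists = np.linalg.norm(user_emb[:, None, :] - user_emb[None, :, :], axis=-1)
    np.fill_diagonal(dists, np.inf)
    knn = np.sort(dists, axis=1)[:, :k]
    density = 1.0 / (knn.mean(axis=1) + 1e-8)          # closer neighbors -> denser
    neighbors = np.argsort(dists, axis=1)[:, :k]
    return [i for i in range(len(user_emb))
            if density[i] >= density[neighbors[i]].max()]

rng = np.random.default_rng(0)
users = np.vstack([rng.normal(c, 0.3, size=(50, 8)) for c in (0.0, 3.0, 6.0)])
print("candidate group centers:", find_group_centers(users))
```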
10. arXiv:2410.23330 [cs.CV, cs.AI, cs.LG]
    CLIPErase: Efficient Unlearning of Visual-Textual Associations in CLIP
    Authors: Tianyu Yang, Lisen Dai, Zheyuan Liu, Xiangqi Wang, Meng Jiang, Yapeng Tian, Xiangliang Zhang
    Abstract: Machine unlearning (MU) has gained significant attention as a means to remove specific data from trained models without requiring a full retraining process. While progress has been made in unimodal domains like text and image classification, unlearning in multimodal models remains relatively underexplored. In this work, we address the unique challenges of unlearning in CLIP, a prominent multimodal model that aligns visual and textual representations. We introduce CLIPErase, a novel approach that disentangles and selectively forgets both visual and textual associations, ensuring that unlearning does not compromise model performance. CLIPErase consists of three key modules: a Forgetting Module that disrupts the associations in the forget set, a Retention Module that preserves performance on the retain set, and a Consistency Module that maintains consistency with the original model. Extensive experiments on the CIFAR-100 and Flickr30K datasets across four CLIP downstream tasks demonstrate that CLIPErase effectively forgets designated associations in zero-shot tasks for multimodal samples, while preserving the model's performance on the retain set after unlearning.
    Submitted 30 October, 2024; originally announced October 2024.
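    A schematic combination of the three objectives named in this abstract (forgetting, retention, consistency), assuming cosine-similarity and MSE loss forms; the exact losses, inputs, and weights below are assumptions, not the released CLIPErase implementation.

```python
# Schematic three-term unlearning loss: push apart image/text embeddings on the
# forget set, keep alignment on the retain set, and stay close to the frozen
# original model. Loss forms and weights are illustrative assumptions.
import torch
import torch.nn.functional as F

def clip_erase_loss(img_f, txt_f, img_r, txt_r, img_r_orig, txt_r_orig,
                    w_forget=1.0, w_retain=1.0, w_consist=1.0):
    """All inputs are L2-normalized embedding batches of shape (B, D)."""
    # Forgetting: drive forget-set image/text similarity toward zero.
    loss_forget = (img_f * txt_f).sum(dim=-1).abs().mean()
    # Retention: keep retain-set pairs aligned (high cosine similarity).
    loss_retain = 1.0 - (img_r * txt_r).sum(dim=-1).mean()
    # Consistency: keep retain-set embeddings close to the original (frozen) model's.
    loss_consist = F.mse_loss(img_r, img_r_orig) + F.mse_loss(txt_r, txt_r_orig)
    return w_forget * loss_forget + w_retain * loss_retain + w_consist * loss_consist

# Toy call with random, normalized embeddings.
f = lambda: F.normalize(torch.randn(8, 512), dim=-1)
print(clip_erase_loss(f(), f(), f(), f(), f(), f()).item())
```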
arXiv:2410.23152 [pdf, other] (https://arxiv.org/abs/2410.23152)
Subjects: quant-ph, cond-mat.str-el, cs.LG
Title: When can classical neural networks represent quantum states?
Authors: Tai-Hsuan Yang, Mehdi Soleimanifar, Thiago Bergamaschi, John Preskill
Abstract: A naive classical representation of an n-qubit state requires specifying exponentially many amplitudes in the computational basis. Past works have demonstrated that classical neural networks can succinctly express these amplitudes for many physically relevant states, leading to computationally powerful representations known as neural quantum states. What underpins the efficacy of such representations? We show that conditional correlations present in the measurement distribution of quantum states control the performance of their neural representations. Such conditional correlations are basis dependent, arise due to measurement-induced entanglement, and reveal features not accessible through conventional few-body correlations often examined in studies of phases of matter. By combining theoretical and numerical analysis, we demonstrate how the state's entanglement and sign structure, along with the choice of measurement basis, give rise to distinct patterns of short- or long-range conditional correlations. Our findings provide a rigorous framework for exploring the expressive power of neural quantum states.
Submitted 30 October, 2024; originally announced October 2024.
Comments: 37 pages, 9 figures
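As a rough illustration of what a conditional correlation in measurement data can look like, the sketch below estimates, from an array of measurement bitstrings, the covariance between two sites conditioned on the outcomes observed between them; this is only one plausible instantiation, and the paper's precise definition may differ.

```python
# Illustrative sketch only: probe a "conditional correlation" in measurement data as the
# (frequency-weighted) covariance of outcomes at sites i and j, conditioned on the observed
# bitstring on the sites lying between them. The paper's exact quantity may be defined differently.
import numpy as np

def conditional_correlation(samples: np.ndarray, i: int, j: int) -> float:
    """samples: (num_shots, n) array of 0/1 measurement outcomes in a fixed basis."""
    between = list(range(min(i, j) + 1, max(i, j)))
    groups = {}
    for row in samples:
        groups.setdefault(tuple(row[between]), []).append((row[i], row[j]))
    total, weight = 0.0, 0
    for pairs in groups.values():
        a = np.array(pairs, dtype=float)
        if len(a) < 2:
            continue
        cov = np.mean(a[:, 0] * a[:, 1]) - a[:, 0].mean() * a[:, 1].mean()
        total += len(a) * abs(cov)
        weight += len(a)
    return total / weight if weight else 0.0

# Example: uniformly random (product-like) samples should give a value near zero.
shots = np.random.default_rng(0).integers(0, 2, size=(2000, 8))
print(conditional_correlation(shots, 0, 7))
```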
href="/search/cs?searchtype=author&query=Liu%2C+T">Tianming Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.21418v1-abstract-short" style="display: inline;"> The rapid advances in Large Language Models (LLMs) have the potential to transform manufacturing industry, offering new opportunities to optimize processes, improve efficiency, and drive innovation. This paper provides a comprehensive exploration of the integration of LLMs into the manufacturing domain, focusing on their potential to automate and enhance various aspects of manufacturing, from prod… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.21418v1-abstract-full').style.display = 'inline'; document.getElementById('2410.21418v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.21418v1-abstract-full" style="display: none;"> The rapid advances in Large Language Models (LLMs) have the potential to transform manufacturing industry, offering new opportunities to optimize processes, improve efficiency, and drive innovation. This paper provides a comprehensive exploration of the integration of LLMs into the manufacturing domain, focusing on their potential to automate and enhance various aspects of manufacturing, from product design and development to quality control, supply chain optimization, and talent management. Through extensive evaluations across multiple manufacturing tasks, we demonstrate the remarkable capabilities of state-of-the-art LLMs, such as GPT-4V, in understanding and executing complex instructions, extracting valuable insights from vast amounts of data, and facilitating knowledge sharing. We also delve into the transformative potential of LLMs in reshaping manufacturing education, automating coding processes, enhancing robot control systems, and enabling the creation of immersive, data-rich virtual environments through the industrial metaverse. By highlighting the practical applications and emerging use cases of LLMs in manufacturing, this paper aims to provide a valuable resource for professionals, researchers, and decision-makers seeking to harness the power of these technologies to address real-world challenges, drive operational excellence, and unlock sustainable growth in an increasingly competitive landscape. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.21418v1-abstract-full').style.display = 'none'; document.getElementById('2410.21418v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.20727">arXiv:2410.20727</a> <span> [<a href="https://arxiv.org/pdf/2410.20727">pdf</a>, <a href="https://arxiv.org/format/2410.20727">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Faster WIND: Accelerating Iterative Best-of-$N$ Distillation for LLM Alignment </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yang%2C+T">Tong Yang</a>, <a href="/search/cs?searchtype=author&query=Mei%2C+J">Jincheng Mei</a>, <a href="/search/cs?searchtype=author&query=Dai%2C+H">Hanjun Dai</a>, <a href="/search/cs?searchtype=author&query=Wen%2C+Z">Zixin Wen</a>, <a href="/search/cs?searchtype=author&query=Cen%2C+S">Shicong Cen</a>, <a href="/search/cs?searchtype=author&query=Schuurmans%2C+D">Dale Schuurmans</a>, <a href="/search/cs?searchtype=author&query=Chi%2C+Y">Yuejie Chi</a>, <a href="/search/cs?searchtype=author&query=Dai%2C+B">Bo Dai</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.20727v1-abstract-short" style="display: inline;"> Recent advances in aligning large language models with human preferences have corroborated the growing importance of best-of-N distillation (BOND). However, the iterative BOND algorithm is prohibitively expensive in practice due to the sample and computation inefficiency. This paper addresses the problem by revealing a unified game-theoretic connection between iterative BOND and self-play alignmen… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.20727v1-abstract-full').style.display = 'inline'; document.getElementById('2410.20727v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.20727v1-abstract-full" style="display: none;"> Recent advances in aligning large language models with human preferences have corroborated the growing importance of best-of-N distillation (BOND). However, the iterative BOND algorithm is prohibitively expensive in practice due to the sample and computation inefficiency. This paper addresses the problem by revealing a unified game-theoretic connection between iterative BOND and self-play alignment, which unifies seemingly disparate algorithmic paradigms. Based on the connection, we establish a novel framework, WIN rate Dominance (WIND), with a series of efficient algorithms for regularized win rate dominance optimization that approximates iterative BOND in the parameter space. We provides provable sample efficiency guarantee for one of the WIND variant with the square loss objective. The experimental results confirm that our algorithm not only accelerates the computation, but also achieves superior sample efficiency compared to existing methods. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.20727v1-abstract-full').style.display = 'none'; document.getElementById('2410.20727v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.20381">arXiv:2410.20381</a> <span> [<a href="https://arxiv.org/pdf/2410.20381">pdf</a>, <a href="https://arxiv.org/format/2410.20381">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> </div> </div> <p class="title is-5 mathjax"> Efficient and Effective Retrieval of Dense-Sparse Hybrid Vectors using Graph-based Approximate Nearest Neighbor Search </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+H">Haoyu Zhang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+J">Jun Liu</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+Z">Zhenhua Zhu</a>, <a href="/search/cs?searchtype=author&query=Zeng%2C+S">Shulin Zeng</a>, <a href="/search/cs?searchtype=author&query=Sheng%2C+M">Maojia Sheng</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+T">Tao Yang</a>, <a href="/search/cs?searchtype=author&query=Dai%2C+G">Guohao Dai</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yu Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.20381v1-abstract-short" style="display: inline;"> ANNS for embedded vector representations of texts is commonly used in information retrieval, with two important information representations being sparse and dense vectors. While it has been shown that combining these representations improves accuracy, the current method of conducting sparse and dense vector searches separately suffers from low scalability and high system complexity. Alternatively,… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.20381v1-abstract-full').style.display = 'inline'; document.getElementById('2410.20381v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.20381v1-abstract-full" style="display: none;"> ANNS for embedded vector representations of texts is commonly used in information retrieval, with two important information representations being sparse and dense vectors. While it has been shown that combining these representations improves accuracy, the current method of conducting sparse and dense vector searches separately suffers from low scalability and high system complexity. Alternatively, building a unified index faces challenges with accuracy and efficiency. To address these issues, we propose a graph-based ANNS algorithm for dense-sparse hybrid vectors. Firstly, we propose a distribution alignment method to improve accuracy, which pre-samples dense and sparse vectors to analyze their distance distribution statistic, resulting in a 1%$\sim$9% increase in accuracy. 
Secondly, to improve efficiency, we design an adaptive two-stage computation strategy that initially computes dense distances only and later computes hybrid distances. Further, we prune the sparse vectors to speed up the calculation. Compared to a naive implementation, we achieve $\sim2.1\times$ acceleration. Thorough experiments show that our algorithm achieves 8.9x$\sim$11.7x throughput at equal accuracy compared to existing hybrid vector search algorithms.
Submitted 27 October, 2024; originally announced October 2024.
Comments: 8 pages
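A minimal sketch of the two-stage idea, assuming dense vectors as NumPy arrays and sparse vectors as term-weight dictionaries: screen candidates by the cheap dense score first, then re-rank a shortlist with the hybrid dense-plus-sparse score after pruning small sparse weights. The shortlist size and pruning threshold are illustrative assumptions, and the graph index itself is omitted.

```python
# Hedged sketch of the two-stage strategy described in the abstract: dense-only screening,
# then hybrid re-scoring of a shortlist. Dict-based sparse vectors, the shortlist size, and
# the pruning threshold are illustrative assumptions, not the paper's implementation.
import numpy as np

def hybrid_search(q_dense, q_sparse, docs_dense, docs_sparse, shortlist=50, topk=10, prune=0.01):
    # Stage 1: dense-only screening (inner-product score).
    dense_scores = docs_dense @ q_dense
    shortlist_ids = np.argsort(-dense_scores)[:shortlist]

    # Optional pruning: drop tiny sparse query weights to speed up stage 2.
    q_sparse = {t: w for t, w in q_sparse.items() if abs(w) >= prune}

    # Stage 2: hybrid re-scoring of the shortlist (dense + sparse dot products).
    def sparse_dot(doc):
        return sum(w * doc.get(t, 0.0) for t, w in q_sparse.items())
    hybrid = [(i, dense_scores[i] + sparse_dot(docs_sparse[i])) for i in shortlist_ids]
    hybrid.sort(key=lambda x: -x[1])
    return hybrid[:topk]

# Toy usage with random data.
rng = np.random.default_rng(0)
docs_dense = rng.normal(size=(1000, 64))
docs_sparse = [{int(rng.integers(0, 5000)): 1.0} for _ in range(1000)]
print(hybrid_search(rng.normal(size=64), {42: 0.5, 7: 0.2}, docs_dense, docs_sparse))
```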
arXiv:2410.18701 [pdf, other] (https://arxiv.org/abs/2410.18701)
Subjects: cs.LG
Title: BATON: Enhancing Batch-wise Inference Efficiency for Large Language Models via Dynamic Re-batching
Authors: Peizhuang Cong, Qizhi Chen, Haochen Zhao, Tong Yang
Abstract: The advanced capabilities of Large Language Models (LLMs) have inspired the development of various interactive web services or applications, such as ChatGPT, which offer query inference services for users. Unlike traditional DNN models, the inference of an LLM entails different numbers of iterations of forward computation for different queries, which results in efficiency challenges for existing run-to-completion batch-wise inference. Hence, some methods refine batch-wise inference to the iteration level by duplicating all nonlinear layers of the LLM. However, this approach not only increases resource usage but also introduces idle computations to the batch due to the prefilling of newly added queries. Therefore, we propose BATON, an efficient batch-wise LLM inference scheme that dynamically adjusts the processing batch and can achieve near-zero idle computations without incurring additional resource consumption. To do so, BATON 1) shapes the vectors involved in the inference of the newly inserted query and the processing batch to align their dimensions, and generates a new attention mask based on this vector shaping to ensure inference correctness, which enables query insertion without consuming additional resources; 2) embeds the prefilled Keys and Values of the new query into the KV_Cache of the processing batch by leveraging the prefilling and decoding separation mechanism, eliminating the idle computations introduced by the prefilling process of the new query. Experimental results show that compared to the state-of-the-art solution Orca, BATON improves query processing by up to 1.75 times.
Submitted 24 October, 2024; originally announced October 2024.
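A schematic sketch of the vector-shaping step described in point 1), under the assumption that the new query's prefilled cache is left-padded to the running batch's sequence length and that the attention mask marks only its real positions; this illustrates the alignment idea, not BATON's implementation.

```python
# Schematic sketch (not BATON's code): left-pad a newly prefilled query's key/value cache to
# the running batch's sequence length and extend the attention mask so the new row attends
# only to its own real positions. Shapes, padding layout, and names are assumptions.
import torch

def insert_query(batch_kv: torch.Tensor, batch_mask: torch.Tensor,
                 new_kv: torch.Tensor, new_len: int):
    """batch_kv: (B, T, d) cached keys (or values); batch_mask: (B, T) with 1 = real token.
    new_kv: (1, new_len, d) prefilled cache of the inserted query (assumes new_len <= T)."""
    B, T, d = batch_kv.shape
    pad = T - new_len
    padded_kv = torch.cat([torch.zeros(1, pad, d), new_kv], dim=1)        # align dimensions
    new_mask = torch.cat([torch.zeros(1, pad), torch.ones(1, new_len)], dim=1)
    return torch.cat([batch_kv, padded_kv], dim=0), torch.cat([batch_mask, new_mask], dim=0)

# Toy usage: a batch of 3 sequences of length 16 receives a query prefilled to length 5.
kv, mask = torch.randn(3, 16, 8), torch.ones(3, 16)
kv2, mask2 = insert_query(kv, mask, torch.randn(1, 5, 8), new_len=5)
print(kv2.shape, mask2.shape)  # torch.Size([4, 16, 8]) torch.Size([4, 16])
```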
arXiv:2410.17159 [pdf, other] (https://arxiv.org/abs/2410.17159)
Subjects: cs.LG
Title: LiNo: Advancing Recursive Residual Decomposition of Linear and Nonlinear Patterns for Robust Time Series Forecasting
Authors: Guoqi Yu, Yaoming Li, Xiaoyu Guo, Dayu Wang, Zirui Liu, Shujun Wang, Tong Yang
Abstract: Forecasting models are pivotal in a data-driven world with vast volumes of time series data that appear as a compound of vast Linear and Nonlinear patterns. Recent deep time series forecasting models struggle to utilize seasonal and trend decomposition to separate the entangled components. Such a strategy only explicitly extracts simple linear patterns like trends, leaving the other linear modes and vast unexplored nonlinear patterns to the residual. Their flawed linear and nonlinear feature extraction models and shallow-level decomposition limit their adaptation to the diverse patterns present in real-world scenarios. Given this, we innovate Recursive Residual Decomposition by introducing explicit extraction of both linear and nonlinear patterns. This deeper-level decomposition framework, named LiNo, captures linear patterns using a Li block, which can be a moving average kernel, and models nonlinear patterns using a No block, which can be a Transformer encoder. The extraction of these two patterns is performed alternately and recursively. To achieve the full potential of LiNo, we develop the current simple linear pattern extractor into a general learnable autoregressive model, and design a novel No block that can handle all essential nonlinear patterns. Remarkably, the proposed LiNo achieves state-of-the-art results on thirteen real-world benchmarks under univariate and multivariate forecasting scenarios. Experiments show that current forecasting models can deliver more robust and precise results through this advanced Recursive Residual Decomposition. We hope this work offers insight into designing more effective forecasting models. Code is available at this repository: https://github.com/Levi-Ackman/LiNo.
Submitted 22 October, 2024; originally announced October 2024.
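A minimal sketch of the recursion, with a moving average standing in for the Li block and a crude polynomial fit standing in for the (learnable) No block; the real blocks are trained modules, so this only illustrates the alternate-and-subtract structure.

```python
# Hedged sketch of recursive residual decomposition: alternately extract a "linear" component
# (moving average, a stand-in for the Li block) and a "nonlinear" component (polynomial fit,
# a stand-in for the learnable No block), subtracting each from the running residual.
import numpy as np

def moving_average(x: np.ndarray, k: int = 9) -> np.ndarray:
    pad = k // 2
    xp = np.pad(x, (pad, pad), mode="edge")
    return np.convolve(xp, np.ones(k) / k, mode="valid")

def nonlinear_fit(x: np.ndarray, degree: int = 5) -> np.ndarray:
    t = np.arange(len(x))
    return np.polyval(np.polyfit(t, x, degree), t)

def lino_like_decompose(x: np.ndarray, levels: int = 3):
    residual, components = x.astype(float), []
    for _ in range(levels):
        li = moving_average(residual); residual = residual - li     # "Li" step
        no = nonlinear_fit(residual);  residual = residual - no     # "No" step
        components.append((li, no))
    return components, residual

t = np.linspace(0, 8 * np.pi, 400)
series = 0.1 * t + np.sin(t) + 0.2 * np.random.default_rng(0).normal(size=t.size)
comps, resid = lino_like_decompose(series)
print(len(comps), float(np.abs(resid).mean()))   # residual shrinks as levels are applied
```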
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.16140">arXiv:2410.16140</a> <span> [<a href="https://arxiv.org/pdf/2410.16140">pdf</a>, <a href="https://arxiv.org/format/2410.16140">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Cooperative Multistatic Target Detection in Cell-Free Communication Networks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yang%2C+T">Tianyu Yang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+S">Shuangyang Li</a>, <a href="/search/cs?searchtype=author&query=Song%2C+Y">Yi Song</a>, <a href="/search/cs?searchtype=author&query=Zhi%2C+K">Kangda Zhi</a>, <a href="/search/cs?searchtype=author&query=Caire%2C+G">Giuseppe Caire</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.16140v1-abstract-short" style="display: inline;"> In this work, we consider the target detection problem in a multistatic integrated sensing and communication (ISAC) scenario characterized by the cell-free MIMO communication network deployment, where multiple radio units (RUs) in the network cooperate with each other for the sensing task. By exploiting the angle resolution from multiple arrays deployed in the network and the delay resolution from… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.16140v1-abstract-full').style.display = 'inline'; document.getElementById('2410.16140v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.16140v1-abstract-full" style="display: none;"> In this work, we consider the target detection problem in a multistatic integrated sensing and communication (ISAC) scenario characterized by the cell-free MIMO communication network deployment, where multiple radio units (RUs) in the network cooperate with each other for the sensing task. By exploiting the angle resolution from multiple arrays deployed in the network and the delay resolution from the communication signals, i.e., orthogonal frequency division multiplexing (OFDM) signals, we formulate a cooperative sensing problem with coherent data fusion of multiple RUs' observations and propose a sparse Bayesian learning (SBL)-based method, where the global coordinates of target locations are directly detected. Intensive numerical results indicate promising target detection performance of the proposed SBL-based method. Additionally, a theoretical analysis of the considered cooperative multistatic sensing task is provided using the pairwise error probability (PEP) analysis, which can be used to provide design insights, e.g., illumination and beam patterns, for the considered problem. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.16140v1-abstract-full').style.display = 'none'; document.getElementById('2410.16140v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">submitted to WCNC 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.14875">arXiv:2410.14875</a> <span> [<a href="https://arxiv.org/pdf/2410.14875">pdf</a>, <a href="https://arxiv.org/format/2410.14875">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Which LLMs are Difficult to Detect? A Detailed Analysis of Potential Factors Contributing to Difficulties in LLM Text Detection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Thorat%2C+S">Shantanu Thorat</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+T">Tianbao Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.14875v1-abstract-short" style="display: inline;"> As LLMs increase in accessibility, LLM-generated texts have proliferated across several fields, such as scientific, academic, and creative writing. However, LLMs are not created equally; they may have different architectures and training datasets. Thus, some LLMs may be more challenging to detect than others. Using two datasets spanning four total writing domains, we train AI-generated (AIG) text… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.14875v1-abstract-full').style.display = 'inline'; document.getElementById('2410.14875v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.14875v1-abstract-full" style="display: none;"> As LLMs increase in accessibility, LLM-generated texts have proliferated across several fields, such as scientific, academic, and creative writing. However, LLMs are not created equally; they may have different architectures and training datasets. Thus, some LLMs may be more challenging to detect than others. Using two datasets spanning four total writing domains, we train AI-generated (AIG) text classifiers using the LibAUC library - a deep learning library for training classifiers with imbalanced datasets. Our results in the Deepfake Text dataset show that AIG-text detection varies across domains, with scientific writing being relatively challenging. In the Rewritten Ivy Panda (RIP) dataset focusing on student essays, we find that the OpenAI family of LLMs was substantially difficult for our classifiers to distinguish from human texts. Additionally, we explore possible factors that could explain the difficulties in detecting OpenAI-generated texts. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.14875v1-abstract-full').style.display = 'none'; document.getElementById('2410.14875v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted at NeurIPS 2024 - Safe Generative AI Workshop</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.14249">arXiv:2410.14249</a> <span> [<a href="https://arxiv.org/pdf/2410.14249">pdf</a>, <a href="https://arxiv.org/format/2410.14249">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> A Tactile Feedback Approach to Path Recovery after High-Speed Impacts for Collision-Resilient Drones </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Bredenbeck%2C+A">Anton Bredenbeck</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+T">Teaya Yang</a>, <a href="/search/cs?searchtype=author&query=Hamaza%2C+S">Salua Hamaza</a>, <a href="/search/cs?searchtype=author&query=Mueller%2C+M+W">Mark W. Mueller</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.14249v1-abstract-short" style="display: inline;"> Aerial robots are a well-established solution for exploration, monitoring, and inspection, thanks to their superior maneuverability and agility. However, in many environments of interest, they risk crashing and sustaining damage following collisions. Traditional methods focus on avoiding obstacles entirely to prevent damage, but these approaches can be limiting, particularly in complex environment… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.14249v1-abstract-full').style.display = 'inline'; document.getElementById('2410.14249v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.14249v1-abstract-full" style="display: none;"> Aerial robots are a well-established solution for exploration, monitoring, and inspection, thanks to their superior maneuverability and agility. However, in many environments of interest, they risk crashing and sustaining damage following collisions. Traditional methods focus on avoiding obstacles entirely to prevent damage, but these approaches can be limiting, particularly in complex environments where collisions may be unavoidable, or on weight and compute-constrained platforms. This paper presents a novel approach to enhance the robustness and autonomy of drones in such scenarios by developing a path recovery and adjustment method for a high-speed collision-resistant drone equipped with binary contact sensors. The proposed system employs an estimator that explicitly models collisions, using pre-collision velocities and rates to predict post-collision dynamics, thereby improving the drone's state estimation accuracy. 
Additionally, we introduce a vector-field-based path representation which guarantees convergence to the path. Post-collision, the contact point is incorporated into the vector field as a repulsive potential, enabling the drone to avoid obstacles while naturally converging to the original path. The effectiveness of this method is validated through Monte Carlo simulations and demonstrated on a physical prototype, showing successful path following and adjustment through collisions as well as recovery from collisions at speeds up to 3.7 m/s.
Submitted 18 October, 2024; originally announced October 2024.
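A minimal sketch of the vector-field idea, assuming a straight-line reference path and hand-picked gains: the field pulls the vehicle back onto the path while making progress along it, and adds a repulsive term around a sensed contact point. The gains, radii, and path parameterization are illustrative assumptions, not the paper's controller.

```python
# Hedged sketch of the idea in the abstract: a desired-velocity field that converges to a
# reference path, plus a repulsive term centred on a sensed contact point. The straight-line
# path, gains, and radii below are illustrative assumptions.
import numpy as np

def path_vector_field(p, path_point, path_dir, contact=None,
                      k_conv=1.5, k_along=1.0, k_rep=2.0, rep_radius=0.5):
    """Return a desired velocity at position p (2-D numpy arrays throughout)."""
    e = p - path_point
    e_perp = e - np.dot(e, path_dir) * path_dir           # error component off the path
    v = -k_conv * e_perp + k_along * path_dir              # converge to the path, progress along it
    if contact is not None:
        d = p - contact
        dist = np.linalg.norm(d)
        if 1e-9 < dist < rep_radius:
            v += k_rep * (rep_radius - dist) * d / dist    # push away from the contact point
    return v

# Toy rollout: drift back to the x-axis path after an impact sensed at (1.0, 0.2).
p = np.array([1.0, 0.6])
for _ in range(50):
    p = p + 0.05 * path_vector_field(p, np.array([0.0, 0.0]), np.array([1.0, 0.0]),
                                     contact=np.array([1.0, 0.2]))
print(p)
```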
arXiv:2410.09235 [pdf, ps, other] (https://arxiv.org/abs/2410.09235)
Subjects: cs.CC, cs.DM, cs.DS
Title: Relative-error monotonicity testing
Authors: Xi Chen, Anindya De, Yizhi Huang, Yuhao Li, Shivam Nadimpalli, Rocco A. Servedio, Tianqi Yang
Abstract: The standard model of Boolean function property testing is not well suited for testing $\textit{sparse}$ functions which have few satisfying assignments, since every such function is close (in the usual Hamming distance metric) to the constant-0 function. In this work we propose and investigate a new model for property testing of Boolean functions, called $\textit{relative-error testing}$, which provides a natural framework for testing sparse functions. This new model defines the distance between two functions $f, g: \{0,1\}^n \to \{0,1\}$ to be $$\textsf{reldist}(f,g) := \frac{|f^{-1}(1) \triangle g^{-1}(1)|}{|f^{-1}(1)|}.$$ This is a more demanding distance measure than the usual Hamming distance $|f^{-1}(1) \triangle g^{-1}(1)|/2^n$ when $|f^{-1}(1)| \ll 2^n$; to compensate for this, algorithms in the new model have access both to a black-box oracle for the function $f$ being tested and to a source of independent uniform satisfying assignments of $f$. In this paper we first give a few general results about the relative-error testing model; then, as our main technical contribution, we give a detailed study of algorithms and lower bounds for relative-error testing of $\textit{monotone}$ Boolean functions. We give upper and lower bounds which are parameterized by $N=|f^{-1}(1)|$, the sparsity of the function $f$ being tested. Our results show that there are interesting differences between relative-error monotonicity testing of sparse Boolean functions, and monotonicity testing in the standard model. These results motivate further study of the testability of Boolean function properties in the relative-error model.
Submitted 11 October, 2024; originally announced October 2024.
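To make the definition concrete, the sketch below computes reldist exactly by enumeration for small n; the paper's testers, by contrast, only get black-box access to the function plus independent uniform satisfying assignments of it.

```python
# Small illustration of the relative-error distance from the abstract, computed exactly by
# brute-force enumeration for small n. This is only to make the definition concrete; it is
# not a tester from the paper.
from itertools import product

def reldist(f, g, n: int) -> float:
    """reldist(f,g) = |f^{-1}(1) symdiff g^{-1}(1)| / |f^{-1}(1)| for f,g: {0,1}^n -> {0,1}."""
    f_ones = {x for x in product((0, 1), repeat=n) if f(x)}
    g_ones = {x for x in product((0, 1), repeat=n) if g(x)}
    if not f_ones:
        raise ValueError("f has no satisfying assignments; reldist is undefined")
    return len(f_ones ^ g_ones) / len(f_ones)

# Example with sparse functions on n = 10 variables: f accepts inputs whose first three bits
# are all 1; g additionally requires the fourth bit to be 1.
n = 10
f = lambda x: int(x[0] and x[1] and x[2])
g = lambda x: int(x[0] and x[1] and x[2] and x[3])
print(reldist(f, g, n))   # 0.5: half of f's satisfying assignments are lost, none are added
```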
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.09199">arXiv:2410.09199</a> <span> [<a href="https://arxiv.org/pdf/2410.09199">pdf</a>, <a href="https://arxiv.org/format/2410.09199">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> An Efficient Contrastive Unimodal Pretraining Method for EHR Time Series Data </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=King%2C+R">Ryan King</a>, <a href="/search/cs?searchtype=author&query=Kodali%2C+S">Shivesh Kodali</a>, <a href="/search/cs?searchtype=author&query=Krueger%2C+C">Conrad Krueger</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+T">Tianbao Yang</a>, <a href="/search/cs?searchtype=author&query=Mortazavi%2C+B+J">Bobak J. Mortazavi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.09199v1-abstract-short" style="display: inline;"> Machine learning has revolutionized the modeling of clinical timeseries data. Using machine learning, a Deep Neural Network (DNN) can be automatically trained to learn a complex mapping of its input features for a desired task. This is particularly valuable in Electronic Health Record (EHR) databases, where patients often spend extended periods in intensive care units (ICUs). Machine learning serv… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.09199v1-abstract-full').style.display = 'inline'; document.getElementById('2410.09199v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.09199v1-abstract-full" style="display: none;"> Machine learning has revolutionized the modeling of clinical timeseries data. Using machine learning, a Deep Neural Network (DNN) can be automatically trained to learn a complex mapping of its input features for a desired task. This is particularly valuable in Electronic Health Record (EHR) databases, where patients often spend extended periods in intensive care units (ICUs). Machine learning serves as an efficient method for extract meaningful information. However, many state-of-the-art (SOTA) methods for training DNNs demand substantial volumes of labeled data, posing significant challenges for clinics in terms of cost and time. Self-supervised learning offers an alternative by allowing practitioners to extract valuable insights from data without the need for costly labels. Yet, current SOTA methods often necessitate large data batches to achieve optimal performance, increasing computational demands. This presents a challenge when working with long clinical timeseries data. To address this, we propose an efficient method of contrastive pretraining tailored for long clinical timeseries data. Our approach utilizes an estimator for negative pair comparison, enabling effective feature extraction. We assess the efficacy of our pretraining using standard self-supervised tasks such as linear evaluation and semi-supervised learning. Additionally, our model demonstrates the ability to impute missing measurements, providing clinicians with deeper insights into patient conditions. 
We demonstrate that our pretraining is capable of achieving better performance as both the size of the model and the size of the measurement vocabulary scale. Finally, we externally validate our model, trained on the MIMIC-III dataset, using the eICU dataset. We demonstrate that our model is capable of learning robust clinical information that is transferable to other clinics.
Submitted 11 October, 2024; originally announced October 2024.

arXiv:2410.09156 [pdf, other] (https://arxiv.org/abs/2410.09156)
Subjects: cs.LG, stat.ML
Title: On Discriminative Probabilistic Modeling for Self-Supervised Representation Learning
Authors: Bokun Wang, Yunwen Lei, Yiming Ying, Tianbao Yang
Abstract: We study the discriminative probabilistic modeling problem on a continuous domain for (multimodal) self-supervised representation learning. To address the challenge of computing the integral in the partition function for each anchor data point, we leverage the multiple importance sampling (MIS) technique for robust Monte Carlo integration, which can recover the InfoNCE-based contrastive loss as a special case. Within this probabilistic modeling framework, we conduct generalization error analysis to reveal the limitation of the current InfoNCE-based contrastive loss for self-supervised representation learning and derive insights for developing better approaches by reducing the error of Monte Carlo integration.
To this end, we propose a novel non-parametric method for approximating the sum of conditional densities required by MIS through convex optimization, yielding a new contrastive objective for self-supervised representation learning. Moreover, we design an efficient algorithm for solving the proposed objective. We empirically compare our algorithm to representative baselines on the contrastive image-language pretraining task. Experimental results on the CC3M and CC12M datasets demonstrate the superior overall performance of our algorithm.
Submitted 11 October, 2024; originally announced October 2024.
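For reference, the InfoNCE-based contrastive loss that the abstract says is recovered as a special case is the standard in-batch objective below; this is the usual formulation, not the paper's new non-parametric objective.

```python
# Standard in-batch InfoNCE contrastive loss, shown only as the baseline the abstract refers to.
import torch
import torch.nn.functional as F

def info_nce(anchors: torch.Tensor, positives: torch.Tensor, tau: float = 0.1) -> torch.Tensor:
    """anchors, positives: (B, d) embeddings where row i of each is a matched pair.
    Every other row of `positives` serves as an in-batch negative for anchor i."""
    a = F.normalize(anchors, dim=-1)
    p = F.normalize(positives, dim=-1)
    logits = a @ p.t() / tau                              # (B, B) similarity matrix
    targets = torch.arange(a.size(0), device=a.device)    # the diagonal holds the positive pairs
    return F.cross_entropy(logits, targets)

print(info_nce(torch.randn(8, 32), torch.randn(8, 32)))
```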
arXiv:2410.08027 [pdf, other] (https://arxiv.org/abs/2410.08027)
Subjects: cs.CL, cs.AI, cs.LG
Title: Private Language Models via Truncated Laplacian Mechanism
Authors: Tianhao Huang, Tao Yang, Ivan Habernal, Lijie Hu, Di Wang
Abstract: Deep learning models for NLP tasks are prone to variants of privacy attacks. To prevent privacy leakage, researchers have investigated word-level perturbations, relying on the formal guarantees of differential privacy (DP) in the embedding space. However, many existing approaches either achieve unsatisfactory performance in the high privacy regime when using the Laplacian or Gaussian mechanism, or resort to weaker relaxations of DP that are inferior to the canonical DP in terms of privacy strength. This raises the question of whether a new method for private word embedding can be designed to overcome these limitations. In this paper, we propose a novel private embedding method called the high dimensional truncated Laplacian mechanism. Specifically, we introduce a non-trivial extension of the truncated Laplacian mechanism, which was previously only investigated in the one-dimensional case. Theoretically, we show that our method has a lower variance compared to previous private word embedding methods. To further validate its effectiveness, we conduct comprehensive experiments on private embedding and downstream tasks using three datasets. Remarkably, even in the high privacy regime, our approach only incurs a slight decrease in utility compared to the non-private scenario.
Submitted 10 October, 2024; originally announced October 2024.
Comments: Accepted by EMNLP 2024, Main Track
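As a one-dimensional, coordinate-wise illustration only: a truncated Laplacian perturbation can be sampled by drawing Laplace noise and rejecting draws outside [-B, B], then adding the result to each embedding coordinate. The scale b and bound B below are arbitrary, and the paper's high-dimensional mechanism and its privacy calibration are not reproduced here.

```python
# Illustrative sketch only: coordinate-wise truncated Laplace noise via rejection sampling.
# The values of b and B are arbitrary; they are NOT calibrated to any privacy budget, and this
# is not the paper's high-dimensional mechanism.
import numpy as np

def truncated_laplace(shape, b: float, B: float, rng=None) -> np.ndarray:
    rng = rng or np.random.default_rng()
    out = np.empty(shape)
    flat = out.reshape(-1)
    for i in range(flat.size):
        while True:
            x = rng.laplace(0.0, b)
            if abs(x) <= B:          # keep only draws inside the truncation interval
                flat[i] = x
                break
    return out

def perturb_embedding(emb: np.ndarray, b: float = 0.1, B: float = 0.5) -> np.ndarray:
    return emb + truncated_laplace(emb.shape, b, B)

emb = np.random.default_rng(0).normal(size=300)     # stand-in word embedding
print(np.abs(perturb_embedding(emb) - emb).max())   # never exceeds B = 0.5
```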
arXiv:2410.07599 [pdf, other] (https://arxiv.org/abs/2410.07599)
Subjects: cs.CV
Title: Causal Image Modeling for Efficient Visual Understanding
Authors: Feng Wang, Timing Yang, Yaodong Yu, Sucheng Ren, Guoyizhe Wei, Angtian Wang, Wei Shao, Yuyin Zhou, Alan Yuille, Cihang Xie
Abstract: In this work, we present a comprehensive analysis of causal image modeling and introduce the Adventurer series models, where we treat images as sequences of patch tokens and employ uni-directional language models to learn visual representations. This modeling paradigm allows us to process images in a recurrent formulation with linear complexity relative to the sequence length, which can effectively address the memory and computation explosion issues posed by high-resolution and fine-grained images. In detail, we introduce two simple designs that seamlessly integrate image inputs into the causal inference framework: a global pooling token placed at the beginning of the sequence and a flipping operation between every two layers. Extensive empirical studies demonstrate the significant efficiency and effectiveness of this causal image modeling paradigm. For example, our base-sized Adventurer model attains a competitive test accuracy of 84.0% on the standard ImageNet-1k benchmark with 216 images/s training throughput, which is 5.3 times more efficient than vision transformers to achieve the same result.
Submitted 10 October, 2024; originally announced October 2024.
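A toy sketch of the two designs named in the abstract, with a cumulative-mean stand-in for the real uni-directional blocks: prepend a single global pooling token, reverse the token order between every two layers, and read the global token back out as the image feature. Everything about the block itself is an assumption; only the token placement and flipping pattern follow the abstract.

```python
# Toy sketch of the two designs named in the abstract. The cumulative-mean "causal layer" is a
# stand-in for the model's real uni-directional blocks; token placement and flipping follow the
# description in the abstract.
import torch

def causal_layer(x: torch.Tensor) -> torch.Tensor:
    """Stand-in uni-directional mixing: each position sees the running mean of its prefix."""
    return x.cumsum(dim=1) / torch.arange(1, x.size(1) + 1).view(1, -1, 1)

def adventurer_like_forward(patch_tokens: torch.Tensor, num_layers: int = 4) -> torch.Tensor:
    B, N, d = patch_tokens.shape
    cls = torch.zeros(B, 1, d)                  # global pooling token at the front
    x = torch.cat([cls, patch_tokens], dim=1)
    for layer in range(num_layers):
        x = causal_layer(x)
        if layer % 2 == 1:                      # flip token order between every two layers
            x = torch.flip(x, dims=[1])
    # With an even number of flips the global token is back at index 0.
    return x[:, 0]

print(adventurer_like_forward(torch.randn(2, 16, 8)).shape)   # torch.Size([2, 8])
```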
arXiv:2410.07093 [pdf, other] (https://arxiv.org/abs/2410.07093)
Subjects: cs.CV
Title: LaMP: Language-Motion Pretraining for Motion Generation, Retrieval, and Captioning
Authors: Zhe Li, Weihao Yuan, Yisheng He, Lingteng Qiu, Shenhao Zhu, Xiaodong Gu, Weichao Shen, Yuan Dong, Zilong Dong, Laurence T. Yang
Abstract: Language plays a vital role in the realm of human motion. Existing methods have largely depended on CLIP text embeddings for motion generation, yet they fall short in effectively aligning language and motion due to CLIP's pretraining on static image-text pairs. This work introduces LaMP, a novel Language-Motion Pretraining model, which transitions from a language-vision to a more suitable language-motion latent space. It addresses key limitations by generating motion-informative text embeddings, significantly enhancing the relevance and semantics of generated motion sequences. With LaMP, we advance three key tasks: text-to-motion generation, motion-text retrieval, and motion captioning through aligned language-motion representation learning. For generation, we utilize LaMP to provide the text condition instead of CLIP, and an autoregressive masked prediction is designed to achieve mask modeling without rank collapse in transformers. For retrieval, motion features from LaMP's motion transformer interact with query tokens to retrieve text features from the text transformer, and vice versa. For captioning, we finetune a large language model with the language-informative motion features to develop a strong motion captioning model. In addition, we introduce the LaMP-BertScore metric to assess the alignment of generated motions with textual descriptions. Extensive experimental results on multiple datasets demonstrate substantial improvements over previous methods across all three tasks. The code of our method will be made public.
Submitted 9 October, 2024; originally announced October 2024.

arXiv:2410.06369 [pdf, other] cs.LG cs.DC stat.ML
Communication-Efficient Federated Group Distributionally Robust Optimization
Authors: Zhishuai Guo, Tianbao Yang
Abstract: Federated learning faces challenges due to the heterogeneity in data volumes and distributions at different clients, which can compromise model generalization ability to various distributions. Existing approaches to address this issue based on group distributionally robust optimization (GDRO) often lead to high communication and sample complexity. To this end, this work introduces algorithms tailored for communication-efficient Federated Group Distributionally Robust Optimization (FGDRO). Our contributions are threefold: Firstly, we introduce the FGDRO-CVaR algorithm, which optimizes the average top-K losses while reducing communication complexity to $O(1/\varepsilon^4)$, where $\varepsilon$ denotes the desired precision level. Secondly, our FGDRO-KL algorithm is crafted to optimize KL regularized FGDRO, cutting communication complexity to $O(1/\varepsilon^3)$. Lastly, we propose FGDRO-KL-Adam to utilize Adam-type local updates in FGDRO-KL, which not only maintains a communication cost of $O(1/\varepsilon^3)$ but also shows potential to surpass SGD-type local steps in practical applications. The effectiveness of our algorithms has been demonstrated on a variety of real-world tasks, including natural language processing and computer vision.
Submitted 11 November, 2024; v1 submitted 8 October, 2024; originally announced October 2024.
Comments: Accepted to NeurIPS 2024
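A toy sketch of the CVaR-style objective mentioned above (the average of the top-K per-group losses); the group losses and the value of K are made up for illustration, and this is not the FGDRO-CVaR algorithm itself.

    import torch

    def topk_average_loss(group_losses: torch.Tensor, k: int) -> torch.Tensor:
        """Average of the K largest per-group losses; gradients flow only
        through the K worst groups, focusing training on the hardest ones."""
        worst, _ = torch.topk(group_losses, k)
        return worst.mean()

    # toy example: losses from 6 client groups, optimize the worst 3
    losses = torch.tensor([0.2, 1.5, 0.7, 2.1, 0.4, 1.1], requires_grad=True)
    objective = topk_average_loss(losses, k=3)
    objective.backward()
    print(objective.item(), losses.grad)   # only the three largest losses receive gradient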
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.06369v2-abstract-full').style.display = 'none'; document.getElementById('2410.06369v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 8 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to NeurIPS 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.05939">arXiv:2410.05939</a> <span> [<a href="https://arxiv.org/pdf/2410.05939">pdf</a>, <a href="https://arxiv.org/format/2410.05939">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> </div> </div> <p class="title is-5 mathjax"> RLRF4Rec: Reinforcement Learning from Recsys Feedback for Enhanced Recommendation Reranking </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Sun%2C+C">Chao Sun</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+Y">Yaobo Liang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Y">Yaming Yang</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+S">Shilin Xu</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+T">Tianmeng Yang</a>, <a href="/search/cs?searchtype=author&query=Tong%2C+Y">Yunhai Tong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.05939v1-abstract-short" style="display: inline;"> Large Language Models (LLMs) have demonstrated remarkable performance across diverse domains, prompting researchers to explore their potential for use in recommendation systems. Initial attempts have leveraged the exceptional capabilities of LLMs, such as rich knowledge and strong generalization through In-context Learning, which involves phrasing the recommendation task as prompts. Nevertheless,… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.05939v1-abstract-full').style.display = 'inline'; document.getElementById('2410.05939v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.05939v1-abstract-full" style="display: none;"> Large Language Models (LLMs) have demonstrated remarkable performance across diverse domains, prompting researchers to explore their potential for use in recommendation systems. Initial attempts have leveraged the exceptional capabilities of LLMs, such as rich knowledge and strong generalization through In-context Learning, which involves phrasing the recommendation task as prompts. Nevertheless, the performance of LLMs in recommendation tasks remains suboptimal due to a substantial disparity between the training tasks for LLMs and recommendation tasks and inadequate recommendation data during pre-training. 
This paper introduces RLRF4Rec, a novel framework integrating Reinforcement Learning from Recsys Feedback for Enhanced Recommendation Reranking (RLRF4Rec) with LLMs to address these challenges. Specifically, we first have the LLM generate inferred user preferences based on the user's interaction history, which are then used to augment traditional ID-based sequence recommendation models. Subsequently, we train a reward model based on the knowledge-augmented recommendation models to evaluate the quality of the reasoning knowledge from the LLM. We then select the best and worst responses from the N samples to construct a dataset for LLM tuning. Finally, we design a structure alignment strategy with Direct Preference Optimization (DPO). We validate the effectiveness of RLRF4Rec through extensive experiments, demonstrating significant improvements in recommendation re-ranking metrics compared to baselines. This demonstrates that our approach significantly improves the capability of LLMs to respond to instructions within recommender systems.
Submitted 8 October, 2024; originally announced October 2024.
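The best-of-N / worst-of-N selection step described above can be pictured with a small sketch; the reward function here is a placeholder, not the knowledge-augmented reward model from the paper.

    from typing import Callable, List, Tuple

    def build_preference_pair(responses: List[str],
                              reward_fn: Callable[[str], float]) -> Tuple[str, str]:
        """Score N sampled LLM responses with a reward model and return the
        (chosen, rejected) pair used for DPO-style preference tuning."""
        ranked = sorted(responses, key=reward_fn)
        return ranked[-1], ranked[0]          # best and worst response

    # toy usage with a stand-in reward function
    toy_reward = lambda text: float(len(set(text.split())))   # placeholder scorer
    samples = ["item A item A", "item A then item B", "item C item B then item A"]
    chosen, rejected = build_preference_pair(samples, toy_reward)
    print("chosen:", chosen)
    print("rejected:", rejected)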

arXiv:2410.03955 [pdf, other] cs.LG cs.AI math.OC stat.ML
Model Developmental Safety: A Safety-Centric Method and Applications in Vision-Language Models
Authors: Gang Li, Wendi Yu, Yao Yao, Wei Tong, Yingbin Liang, Qihang Lin, Tianbao Yang
Abstract: In the real world, a learning-enabled system usually undergoes multiple cycles of model development to enhance the system's ability to handle difficult or emerging tasks. This continual model development process raises a significant issue: model development for acquiring new or improving existing capabilities may inadvertently lose capabilities of the old model, a phenomenon also known as catastrophic forgetting. Existing continual learning studies focus on mitigating catastrophic forgetting by trading off performance on previous tasks and new tasks to ensure good average performance. However, they are inadequate for many applications, especially in safety-critical domains, as failure to strictly preserve the performance of the old model not only introduces safety risks and uncertainties but also imposes substantial expenses in the re-improving and re-validation of existing properties. To address this issue, we introduce model developmental safety as a guarantee of a learning system such that in the model development process the new model should strictly preserve the existing protected capabilities of the old model while improving its performance on target tasks. To ensure model developmental safety, we present a safety-centric framework by formulating model developmental safety as data-dependent constraints. Under this framework, we study how to develop a pretrained vision-language model (aka the CLIP model) for acquiring new capabilities or improving existing capabilities of image classification. We propose an efficient constrained optimization algorithm with theoretical guarantee and use its insights to finetune a CLIP model with task-dependent heads for promoting model developmental safety. Our experiments on improving vision perception capabilities on autonomous driving and scene recognition datasets demonstrate the efficacy of the proposed approach.
Submitted 12 October, 2024; v1 submitted 4 October, 2024; originally announced October 2024.
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">40 pages, 7 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.18486">arXiv:2409.18486</a> <span> [<a href="https://arxiv.org/pdf/2409.18486">pdf</a>, <a href="https://arxiv.org/format/2409.18486">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Evaluation of OpenAI o1: Opportunities and Challenges of AGI </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhong%2C+T">Tianyang Zhong</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Z">Zhengliang Liu</a>, <a href="/search/cs?searchtype=author&query=Pan%2C+Y">Yi Pan</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yutong Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+Y">Yifan Zhou</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+S">Shizhe Liang</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+Z">Zihao Wu</a>, <a href="/search/cs?searchtype=author&query=Lyu%2C+Y">Yanjun Lyu</a>, <a href="/search/cs?searchtype=author&query=Shu%2C+P">Peng Shu</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+X">Xiaowei Yu</a>, <a href="/search/cs?searchtype=author&query=Cao%2C+C">Chao Cao</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+H">Hanqi Jiang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+H">Hanxu Chen</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yiwei Li</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+J">Junhao Chen</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+H">Huawen Hu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yihen Liu</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+H">Huaqin Zhao</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+S">Shaochen Xu</a>, <a href="/search/cs?searchtype=author&query=Dai%2C+H">Haixing Dai</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+L">Lin Zhao</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+R">Ruidong Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+W">Wei Zhao</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Z">Zhenyuan Yang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+J">Jingyuan Chen</a> , et al. (53 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.18486v1-abstract-short" style="display: inline;"> This comprehensive study evaluates the performance of OpenAI's o1-preview large language model across a diverse array of complex reasoning tasks, spanning multiple domains, including computer science, mathematics, natural sciences, medicine, linguistics, and social sciences. 
Through rigorous testing, o1-preview demonstrated remarkable capabilities, often achieving human-level or superior performance in areas ranging from coding challenges to scientific reasoning and from language processing to creative problem-solving. Key findings include:
- 83.3% success rate in solving complex competitive programming problems, surpassing many human experts.
- Superior ability in generating coherent and accurate radiology reports, outperforming other evaluated models.
- 100% accuracy in high school-level mathematical reasoning tasks, providing detailed step-by-step solutions.
- Advanced natural language inference capabilities across general and specialized domains like medicine.
- Impressive performance in chip design tasks, outperforming specialized models in areas such as EDA script generation and bug analysis.
- Remarkable proficiency in anthropology and geology, demonstrating deep understanding and reasoning in these specialized fields.
- Strong capabilities in quantitative investing, with comprehensive financial knowledge and statistical modeling skills.
- Effective performance in social media analysis, including sentiment analysis and emotion recognition.
The model excelled particularly in tasks requiring intricate reasoning and knowledge integration across various fields. While some limitations were observed, including occasional errors on simpler problems and challenges with certain highly specialized concepts, the overall results indicate significant progress towards artificial general intelligence.
Submitted 27 September, 2024; originally announced September 2024.

arXiv:2409.18295 [pdf, other] cs.LG cs.AI cs.DC
Enhancing Lossy Compression Through Cross-Field Information for Scientific Applications
Authors: Youyuan Liu, Wenqi Jia, Taolue Yang, Miao Yin, Sian Jin
Abstract: Lossy compression is one of the most effective methods for reducing the size of scientific data containing multiple data fields. It reduces information density through prediction or transformation techniques to compress the data. Previous approaches use local information from a single target field when predicting target data points, limiting their potential to achieve higher compression ratios. In this paper, we identify significant cross-field correlations within scientific datasets and propose a novel hybrid prediction model that utilizes a CNN to extract cross-field information and combines it with existing local field information. Our solution enhances the prediction accuracy of lossy compressors, leading to improved compression ratios without compromising data quality. We evaluate our solution on three scientific datasets, demonstrating its ability to improve compression ratios by up to 25% under specific error bounds. Additionally, our solution preserves more data details and reduces artifacts compared to baseline approaches.
Submitted 26 September, 2024; originally announced September 2024.
Comments: 9 pages, 9 figures, accepted by DRBSD-10
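A hypothetical sketch of the cross-field idea: predict a target value from a local window of the target field together with the co-located window of a correlated auxiliary field, and let the compressor encode only the residual. The tiny 1-D CNN and window size are illustrative, not the paper's model.

    import torch
    import torch.nn as nn

    class CrossFieldPredictor(nn.Module):
        """Predict the centre value of a local window of the target field using
        both the target field and a correlated auxiliary field."""
        def __init__(self, window=9):
            super().__init__()
            self.window = window
            self.net = nn.Sequential(
                nn.Conv1d(2, 16, kernel_size=3, padding=1),    # 2 input fields
                nn.ReLU(),
                nn.Conv1d(16, 1, kernel_size=3, padding=1),
            )

        def forward(self, target_win, aux_win):                # each (B, window)
            x = torch.stack([target_win, aux_win], dim=1)      # (B, 2, window)
            return self.net(x)[:, 0, self.window // 2]         # predicted centre value

    predictor = CrossFieldPredictor()
    target = torch.randn(4, 9)        # local window of the target field
    aux = torch.randn(4, 9)           # co-located window of the auxiliary field
    prediction = predictor(target, aux)
    residual = target[:, 4] - prediction                       # what the compressor encodes
    print(prediction.shape, residual.shape)                    # torch.Size([4]) torch.Size([4])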
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.18295v1-abstract-full').style.display = 'none'; document.getElementById('2409.18295v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">9 pages, 9 figures, accepted by DRBSD-10</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.18293">arXiv:2409.18293</a> <span> [<a href="https://arxiv.org/pdf/2409.18293">pdf</a>, <a href="https://arxiv.org/format/2409.18293">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> Towards Safe and Efficient Through-the-Canopy Autonomous Fruit Counting with UAVs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yang%2C+T">Teaya Yang</a>, <a href="/search/cs?searchtype=author&query=Ibrahimov%2C+R">Roman Ibrahimov</a>, <a href="/search/cs?searchtype=author&query=Mueller%2C+M+W">Mark W. Mueller</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.18293v1-abstract-short" style="display: inline;"> We present an autonomous aerial system for safe and efficient through-the-canopy fruit counting. Aerial robot applications in large-scale orchards face significant challenges due to the complexity of fine-tuning flight paths based on orchard layouts, canopy density, and plant variability. Through-the-canopy navigation is crucial for minimizing occlusion by leaves and branches but is more challengi… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.18293v1-abstract-full').style.display = 'inline'; document.getElementById('2409.18293v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.18293v1-abstract-full" style="display: none;"> We present an autonomous aerial system for safe and efficient through-the-canopy fruit counting. Aerial robot applications in large-scale orchards face significant challenges due to the complexity of fine-tuning flight paths based on orchard layouts, canopy density, and plant variability. Through-the-canopy navigation is crucial for minimizing occlusion by leaves and branches but is more challenging due to the complex and dense environment compared to traditional over-the-canopy flights. Our system addresses these challenges by integrating: i) a high-fidelity simulation framework for optimizing flight trajectories, ii) a low-cost autonomy stack for canopy-level navigation and data collection, and iii) a robust workflow for fruit detection and counting using RGB images. We validate our approach through fruit counting with canopy-level aerial images and by demonstrating the autonomous navigation capabilities of our experimental vehicle. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.18293v1-abstract-full').style.display = 'none'; document.getElementById('2409.18293v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.16997">arXiv:2409.16997</a> <span> [<a href="https://arxiv.org/pdf/2409.16997">pdf</a>, <a href="https://arxiv.org/format/2409.16997">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> INT-FlashAttention: Enabling Flash Attention for INT8 Quantization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+S">Shimao Chen</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Z">Zirui Liu</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+Z">Zhiying Wu</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+C">Ce Zheng</a>, <a href="/search/cs?searchtype=author&query=Cong%2C+P">Peizhuang Cong</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+Z">Zihan Jiang</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+Y">Yuhan Wu</a>, <a href="/search/cs?searchtype=author&query=Su%2C+L">Lei Su</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+T">Tong Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.16997v2-abstract-short" style="display: inline;"> As the foundation of large language models (LLMs), self-attention module faces the challenge of quadratic time and memory complexity with respect to sequence length. FlashAttention accelerates attention computation and reduces its memory usage by leveraging the GPU memory hierarchy. A promising research direction is to integrate FlashAttention with quantization methods. This paper introduces INT-F… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.16997v2-abstract-full').style.display = 'inline'; document.getElementById('2409.16997v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.16997v2-abstract-full" style="display: none;"> As the foundation of large language models (LLMs), self-attention module faces the challenge of quadratic time and memory complexity with respect to sequence length. FlashAttention accelerates attention computation and reduces its memory usage by leveraging the GPU memory hierarchy. A promising research direction is to integrate FlashAttention with quantization methods. This paper introduces INT-FlashAttention, the first INT8 quantization architecture compatible with the forward workflow of FlashAttention, which significantly improves the inference speed of FlashAttention on Ampere GPUs. 
We implement our INT-FlashAttention prototype with fully INT8 activations and general matrix-multiplication (GEMM) kernels, making it the first attention operator with fully INT8 input. As a general token-level post-training quantization framework, INT-FlashAttention is also compatible with other data formats such as INT4. Experimental results show that INT-FlashAttention achieves 72% faster inference speed and 82% smaller quantization error compared to standard FlashAttention with FP16 and FP8 data formats.
Submitted 26 September, 2024; v1 submitted 25 September, 2024; originally announced September 2024.
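A small sketch of token-level symmetric INT8 quantization of activations, the data format described above; this shows only the quantize/dequantize arithmetic, not the fused FlashAttention kernel.

    import torch

    def quantize_per_token_int8(x: torch.Tensor):
        """Symmetric per-token INT8 quantization: one scale per token (row)."""
        scale = x.abs().amax(dim=-1, keepdim=True).clamp(min=1e-8) / 127.0
        q = torch.clamp(torch.round(x / scale), -127, 127).to(torch.int8)
        return q, scale

    def dequantize_int8(q: torch.Tensor, scale: torch.Tensor) -> torch.Tensor:
        return q.to(torch.float32) * scale

    x = torch.randn(4, 64)                    # four tokens of a Q/K/V activation
    q, scale = quantize_per_token_int8(x)
    error = (dequantize_int8(q, scale) - x).abs().max()
    print(q.dtype, scale.shape, error)        # torch.int8, torch.Size([4, 1]), small error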

arXiv:2409.16678 [pdf, other] eess.IV cs.AI cs.CV cs.LG
TSBP: Improving Object Detection in Histology Images via Test-time Self-guided Bounding-box Propagation
Authors: Tingting Yang, Liang Xiao, Yizhe Zhang
Abstract: A global threshold (e.g., 0.5) is often applied to determine which bounding boxes should be included in the final results for an object detection task. A higher threshold reduces false positives but may result in missing a significant portion of true positives. A lower threshold can increase detection recall but may also result in more false positives. Because of this, using a preset global threshold (e.g., 0.5) applied to all the bounding box candidates may lead to suboptimal solutions. In this paper, we propose a Test-time Self-guided Bounding-box Propagation (TSBP) method, leveraging Earth Mover's Distance (EMD) to enhance object detection in histology images. TSBP utilizes bounding boxes with high confidence to influence those with low confidence, leveraging visual similarities between them. This propagation mechanism enables bounding boxes to be selected in a controllable, explainable, and robust manner, which surpasses the effectiveness of using simple thresholds and uncertainty calibration methods. Importantly, TSBP does not necessitate additional labeled samples for model training or parameter estimation, unlike calibration methods. We conduct experiments on gland detection and cell detection tasks in histology images. The results show that our proposed TSBP significantly improves detection outcomes when working in conjunction with state-of-the-art deep learning-based detection networks. Compared to other methods such as uncertainty calibration, TSBP yields more robust and accurate object detection predictions while using no additional labeled samples. The code is available at https://github.com/jwhgdeu/TSBP.
Submitted 25 September, 2024; originally announced September 2024.
Comments: MICCAI 2024
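A rough, hypothetical illustration of the propagation idea (not the paper's algorithm): let high-confidence boxes vouch for visually similar low-confidence boxes, with similarity measured by the Earth Mover's Distance between simple 1-D feature samples; the features and the distance threshold are placeholders.

    import numpy as np
    from scipy.stats import wasserstein_distance

    def propagate_boxes(high_conf_feats, low_conf_feats, max_dist=0.05):
        """Keep a low-confidence detection if its 1-D feature values are close
        (in EMD) to those of at least one high-confidence detection."""
        kept = []
        for i, feats in enumerate(low_conf_feats):
            nearest = min(wasserstein_distance(feats, ref) for ref in high_conf_feats)
            if nearest <= max_dist:
                kept.append(i)
        return kept

    rng = np.random.default_rng(0)
    high = [rng.normal(0.5, 0.1, 64) for _ in range(5)]    # stand-in box features
    low = [rng.normal(0.5, 0.1, 64), rng.normal(0.9, 0.1, 64)]
    print(propagate_boxes(high, low))   # indices of low-confidence boxes to keep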

arXiv:2409.15239 [pdf, other] cs.RO (doi: 10.1109/JSEN.2024.3471812)
TacPalm: A Soft Gripper with a Biomimetic Optical Tactile Palm for Stable Precise Grasping
Authors: Xuyang Zhang, Tianqi Yang, Dandan Zhang, Nathan F. Lepora
Abstract: Manipulating fragile objects in environments such as homes and factories requires stable and gentle grasping along with precise and safe placement. Compared to traditional rigid grippers, the use of soft grippers reduces the control complexity and the risk of damaging objects. However, it is challenging to integrate camera-based optical tactile sensing into a soft gripper without compromising the flexibility and adaptability of the fingers, while also ensuring that the precision of tactile perception remains unaffected by passive deformations of the soft structure during object contact. In this paper, we demonstrate a modular soft two-fingered gripper with a 3D-printed optical tactile sensor (the TacTip) integrated in the palm. We propose a soft-grasping strategy that includes three functions: light contact detection, grasp pose adjustment and loss-of-contact detection, so that objects of different shapes and sizes can be grasped stably and placed precisely, which we test with both artificial and household objects. By sequentially implementing these three functions, the grasp success rate progressively improves from 45% without any functions, to 59% with light contact detection, 90% with grasp pose adjustment, and 97% with loss-of-contact detection, achieving a sub-millimeter placement precision. Overall, this work demonstrates the feasibility and utility of integrating optical tactile sensors into the palm of a soft gripper, with applicability to various types of soft manipulators. The proposed grasping strategy has potential applications in areas such as fragile product processing and home assistance.
Submitted 23 September, 2024; originally announced September 2024.

arXiv:2409.13790 [pdf, other] cs.LG cs.AI
Revisiting Synthetic Human Trajectories: Imitative Generation and Benchmarks Beyond Datasaurus
Authors: Bangchao Deng, Xin Jing, Tianyue Yang, Bingqing Qu, Philippe Cudre-Mauroux, Dingqi Yang
Abstract: Human trajectory data, which plays a crucial role in various applications such as crowd management and epidemic prevention, is challenging to obtain due to practical constraints and privacy concerns. In this context, synthetic human trajectory data is generated to simulate real-world human trajectories as closely as possible, often under summary statistics and distributional similarities. However, the complexity of human mobility patterns is oversimplified by these similarities (a.k.a. "Datasaurus"), resulting in intrinsic biases in both generative model design and benchmarks of the generated trajectories. Against this background, we propose MIRAGE, a huMan-Imitative tRAjectory GenErative model designed as a neural Temporal Point Process integrating an Exploration and Preferential Return model. It imitates the human decision-making process in trajectory generation, rather than fitting any specific statistical distributions as traditional methods do, thus avoiding the Datasaurus issue. Moreover, we also propose a comprehensive task-based evaluation protocol beyond Datasaurus to systematically benchmark trajectory generative models on four typical downstream tasks, integrating multiple techniques and evaluation metrics for each task, to comprehensively assess the ultimate utility of the generated trajectories.
We conduct a thorough evaluation of MIRAGE on three real-world user trajectory datasets against a sizeable collection of baselines. Results show that compared to the best baselines, MIRAGE-generated trajectory data not only achieves the best statistical and distributional similarities with 59.0-71.5% improvement, but also yields the best performance in the task-based evaluation with 10.9-33.4% improvement.
Submitted 20 September, 2024; originally announced September 2024.
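For intuition about the Exploration and Preferential Return (EPR) component mentioned above, here is a toy simulation of the classic EPR rule; the fixed exploration probability and integer location ids are simplifications for illustration and are not MIRAGE's actual neural temporal point process.

    import random
    from collections import Counter

    def epr_trajectory(steps=20, p_explore=0.6, seed=0):
        """Exploration and Preferential Return: with probability p_explore visit a
        brand-new location, otherwise return to a previously visited location
        chosen proportionally to how often it was visited."""
        rng = random.Random(seed)
        visits = Counter({0: 1})                  # location id -> visit count
        trajectory, next_new = [0], 1
        for _ in range(steps - 1):
            if rng.random() < p_explore:
                location, next_new = next_new, next_new + 1        # explore
            else:
                places, counts = zip(*visits.items())
                location = rng.choices(places, weights=counts)[0]  # preferential return
            visits[location] += 1
            trajectory.append(location)
        return trajectory

    print(epr_trajectory())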

arXiv:2409.13000 [pdf] cs.LG cs.AI stat.AP stat.ML
Introducing the Large Medical Model: State of the art healthcare cost and risk prediction with transformers trained on patient event sequences
Authors: Ricky Sahu, Eric Marriott, Ethan Siegel, David Wagner, Flore Uzan, Troy Yang, Asim Javed
Abstract: With U.S. healthcare spending approaching $5T (NHE Fact Sheet 2024), and 25% of it estimated to be wasteful (Waste in the US the health care system: estimated costs and potential for savings, n.d.), the need to better predict risk and optimal patient care is evermore important. This paper introduces the Large Medical Model (LMM), a generative pre-trained transformer (GPT) designed to guide and predict the broad facets of patient care and healthcare administration. The model is trained on medical event sequences from over 140M longitudinal patient claims records with a specialized vocabulary built from medical terminology systems and demonstrates a superior capability to forecast healthcare costs and identify potential risk factors. Through experimentation and validation, we showcase the LMM's proficiency not only in cost and risk predictions, but also in discerning intricate patterns within complex medical conditions and an ability to identify novel relationships in patient care. The LMM is able to improve both cost prediction by 14.1% over the best commercial models and chronic conditions prediction by 1.9% over the best transformer models in research predicting a broad set of conditions. The LMM is a substantial advancement in healthcare analytics, offering the potential to significantly enhance risk assessment, cost management, and personalized medicine.
Submitted 19 September, 2024; originally announced September 2024.
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">10 pages, 10 figures</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> I.2.1; K.4.1; K.4.3; J.1; J.3 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.11689">arXiv:2409.11689</a> <span> [<a href="https://arxiv.org/pdf/2409.11689">pdf</a>, <a href="https://arxiv.org/format/2409.11689">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> GUNet: A Graph Convolutional Network United Diffusion Model for Stable and Diversity Pose Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liang%2C+S">Shuowen Liang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+S">Sisi Li</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Q">Qingyun Wang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+C">Cen Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+K">Kaiquan Zhu</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+T">Tian Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.11689v1-abstract-short" style="display: inline;"> Pose skeleton images are an important reference in pose-controllable image generation. In order to enrich the source of skeleton images, recent works have investigated the generation of pose skeletons based on natural language. These methods are based on GANs. However, it remains challenging to perform diverse, structurally correct and aesthetically pleasing human pose skeleton generation with var… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.11689v1-abstract-full').style.display = 'inline'; document.getElementById('2409.11689v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.11689v1-abstract-full" style="display: none;"> Pose skeleton images are an important reference in pose-controllable image generation. In order to enrich the source of skeleton images, recent works have investigated the generation of pose skeletons based on natural language. These methods are based on GANs. However, it remains challenging to perform diverse, structurally correct and aesthetically pleasing human pose skeleton generation with various textual inputs. To address this problem, we propose a framework with GUNet as the main model, PoseDiffusion. It is the first generative framework based on a diffusion model and also contains a series of variants fine-tuned based on a stable diffusion model. PoseDiffusion demonstrates several desired properties that outperform existing methods. 1) Correct Skeletons. GUNet, a denoising model of PoseDiffusion, is designed to incorporate graphical convolutional neural networks. It is able to learn the spatial relationships of the human skeleton by introducing skeletal information during the training process. 2) Diversity. 
We decouple the key points of the skeleton and characterise them separately, and use cross-attention to introduce textual conditions. Experimental results show that PoseDiffusion outperforms existing SoTA algorithms in terms of stability and diversity of text-driven pose skeleton generation. Qualitative analyses further demonstrate its superiority for controllable generation in Stable Diffusion.
Submitted 18 September, 2024; originally announced September 2024.

arXiv:2409.11650 [pdf, other] cs.LG cs.AI
Art and Science of Quantizing Large-Scale Models: A Comprehensive Overview
Authors: Yanshu Wang, Tong Yang, Xiyan Liang, Guoan Wang, Hanning Lu, Xu Zhe, Yaoming Li, Li Weitao
Abstract: This paper provides a comprehensive overview of the principles, challenges, and methodologies associated with quantizing large-scale neural network models. As neural networks have evolved towards larger and more complex architectures to address increasingly sophisticated tasks, the computational and energy costs have escalated significantly.
We explore the necessity and impact of model size growth, highlighting the performance benefits as well as the computational challenges and environmental considerations. The core focus is on model quantization as a fundamental approach to mitigate these challenges by reducing model size and improving efficiency without substantially compromising accuracy. We delve into various quantization techniques, including both post-training quantization (PTQ) and quantization-aware training (QAT), and analyze several state-of-the-art algorithms such as LLM-QAT, PEQA (L4Q), ZeroQuant, SmoothQuant, and others. Through comparative analysis, we examine how these methods address issues like outliers, importance weighting, and activation quantization, ultimately contributing to more sustainable and accessible deployment of large-scale models.
Submitted 17 September, 2024; originally announced September 2024.
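For context on the PTQ/QAT distinction surveyed above, a bare-bones example of the quantization-aware-training trick of quantizing in the forward pass while passing gradients straight through the rounding step (a generic illustration, not any specific algorithm from the survey):

    import torch

    class FakeQuant(torch.autograd.Function):
        """Quantize in the forward pass, use the straight-through estimator (STE)
        in the backward pass so training can proceed through the rounding."""
        @staticmethod
        def forward(ctx, x, scale):
            return torch.clamp(torch.round(x / scale), -127, 127) * scale

        @staticmethod
        def backward(ctx, grad_output):
            return grad_output, None          # gradients ignore the rounding step

    x = torch.randn(8, requires_grad=True)
    y = FakeQuant.apply(x, torch.tensor(0.02))
    y.sum().backward()
    print(x.grad)                             # all ones: the STE passed gradients through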

arXiv:2409.09444 (https://arxiv.org/abs/2409.09444) [pdf, other]
Subjects: cs.CV (Computer Vision and Pattern Recognition)
KAN-HyperpointNet for Point Cloud Sequence-Based 3D Human Action Recognition
Authors: Zhaoyu Chen, Xing Li, Qian Huang, Qiang Geng, Tianjin Yang, Shihao Han
Abstract: Point cloud sequence-based 3D action recognition has achieved impressive performance and efficiency. However, existing point cloud sequence modeling methods cannot adequately balance the precision of limb micro-movements with the integrity of posture macro-structure, leading to the loss of crucial information cues in action inference. To overcome this limitation, we introduce D-Hyperpoint, a novel data type generated through a D-Hyperpoint Embedding module. D-Hyperpoint encapsulates both regional-momentary motion and global-static posture, effectively summarizing the unit human action at each moment. In addition, we present a D-Hyperpoint KANsMixer module, which is recursively applied to nested groupings of D-Hyperpoints to learn the action discrimination information and creatively integrates Kolmogorov-Arnold Networks (KAN) to enhance spatio-temporal interaction within D-Hyperpoints. Finally, we propose KAN-HyperpointNet, a spatio-temporal decoupled network architecture for 3D action recognition. Extensive experiments on two public datasets, MSR Action3D and NTU-RGB+D 60, demonstrate the state-of-the-art performance of our method.
Submitted 14 September, 2024; originally announced September 2024.

arXiv:2409.09345 (https://arxiv.org/abs/2409.09345) [pdf, other]
Subjects: cs.AI (Artificial Intelligence)
Enhancing Decision-Making for LLM Agents via Step-Level Q-Value Models
Authors: Yuanzhao Zhai, Tingkai Yang, Kele Xu, Feng Dawei, Cheng Yang, Bo Ding, Huaimin Wang
Abstract: Agents significantly enhance the capabilities of standalone Large Language Models (LLMs) by perceiving environments, making decisions, and executing actions. However, LLM agents still face challenges in tasks that require multiple decision-making steps. Estimating the value of actions in specific tasks is difficult when intermediate actions are neither appropriately rewarded nor penalized. In this paper, we propose leveraging a task-relevant Q-value model to guide action selection. Specifically, we first collect decision-making trajectories annotated with step-level Q values via Monte Carlo Tree Search (MCTS) and construct preference data. We then use another LLM to fit these preferences through step-level Direct Policy Optimization (DPO), which serves as the Q-value model. During inference, at each decision-making step, LLM agents select the action with the highest Q value before interacting with the environment. We apply our method to various open-source and API-based LLM agents, demonstrating that Q-value models significantly improve their performance. Notably, the performance of the agent built with Phi-3-mini-4k-instruct improved by 103% on WebShop and 75% on HotPotQA when enhanced with Q-value models, even surpassing GPT-4o-mini. Additionally, Q-value models offer several advantages, such as generalization to different LLM agents and seamless integration with existing prompting strategies.
Submitted 14 September, 2024; originally announced September 2024.
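A small sketch of the inference-time rule described in this abstract: at each step, pick the candidate action with the highest Q value. The names `llm_propose_actions` and `q_value_model` are hypothetical stand-ins for the agent's candidate generation and the DPO-trained scorer; the scoring here is random, and only the selection loop is the point.

```python
import random
from typing import Callable, List

def llm_propose_actions(state: str, k: int = 3) -> List[str]:
    # Hypothetical stand-in for an LLM agent proposing k candidate actions.
    return [f"action_{i} given '{state}'" for i in range(k)]

def q_value_model(state: str, action: str) -> float:
    # Hypothetical stand-in for a step-level Q-value model; random for illustration.
    return random.random()

def select_action(state: str,
                  propose: Callable[[str, int], List[str]] = llm_propose_actions,
                  score: Callable[[str, str], float] = q_value_model) -> str:
    """At each decision step, return the candidate action with the highest Q value."""
    candidates = propose(state, 3)
    return max(candidates, key=lambda a: score(state, a))

if __name__ == "__main__":
    print(select_action("search for a red kettle on WebShop"))
```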

arXiv:2409.01887 (https://arxiv.org/abs/2409.01887) [pdf, other]
Subjects: cs.CR (Cryptography and Security)
Detecting and Measuring Security Implications of Entangled Domain Verification in CDN
Authors: Ziyu Lin, Zhiwei Lin, Run Guo, Jianjun Chen, Mingming Zhang, Ximeng Liu, Tianhao Yang, Zhuoran Cao, Robert H. Deng
Abstract: Content Delivery Networks (CDNs) offer a protection layer for enhancing the security of websites. However, a significant security flaw named Absence of Domain Verification (DVA) has emerged recently. Although this threat is recognized, the current practices and security flaws of domain verification strategies in CDNs have not been thoroughly investigated. In this paper, we present DVAHunter, an automated system for detecting DVA vulnerabilities that can lead to domain abuse in CDNs. Our evaluation of 45 major CDN providers reveals the prevalence of DVA: most (39/45) providers do not perform any verification, and even those that do remain exploitable. Additionally, we used DVAHunter to conduct a large-scale measurement of 89M subdomains from Tranco's Top 1M sites hosted on the 45 CDNs under evaluation. Our focus was on two primary DVA exploitation scenarios: covert communication and domain hijacking. We identified over 332K subdomains vulnerable to domain abuse. This tool provides deeper insights into DVA exploitation and allows us to propose viable mitigation practices for CDN providers. To date, we have received vulnerability confirmations from 12 providers; 6 (e.g., Edgio, Kuocai) have implemented fixes, and 1 (ChinaNetCenter) is actively working on solutions based on our recommendations.
Submitted 3 September, 2024; originally announced September 2024.
Comments: 18 pages

arXiv:2409.00941 (https://arxiv.org/abs/2409.00941) [pdf, other]
Subjects: cs.IT (Information Theory); eess.SP (Signal Processing)
Frequency-Position-Fluid Antenna Array for Ultra-dense Connectivity in Terahertz Beamforming Systems
Authors: Heyin Shen, Chong Han, Hao Liu, Tao Yang
Abstract: The position-fluid antenna (PFA) architecture has become one of the appealing technologies to support ubiquitous connectivity demand in next-generation wireless systems. Specifically, allowing the antenna to adjust its physical position to one of the predefined ports within a fixed region can introduce additional spatial diversity and improve the signal-to-interference-plus-noise ratio (SINR). In addition, frequency diversity is also widely explored through frequency interleaving in the terahertz (THz) band. However, the operating bandwidth of one antenna is usually limited to 10% of the central frequency, which leads to a waste of the ultra-broad bandwidth in the THz band. In light of this, a frequency-position-fluid antenna (FPFA) system is proposed in this paper to facilitate ultra-dense connectivity. Specifically, antennas with non-overlapping operating frequency ranges are deployed at the base station (BS) to expand the total available bandwidth and provide frequency-domain diversity, while the PFA-enabled users are capable of providing spatial-domain diversity. The channel model is first derived, based on which a channel correlation-based frequency allocation strategy is proposed. Then, a minimum-projection-based port selection algorithm is developed with singular-value-decomposition (SVD) precoders. Simulation results show that the proposed FPFA architecture exhibits steady performance with an increasing number of users, and outperforms the PFA and the fixed-antenna system in ultra-dense user deployment.
Submitted 2 September, 2024; originally announced September 2024.
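As a rough, generic illustration of the SVD-precoding ingredient mentioned in this abstract (not the paper's port-selection or frequency-allocation algorithms), the sketch below derives a precoder from the dominant right singular vectors of a random channel matrix; all dimensions are arbitrary.

```python
import numpy as np

def svd_precoder(H: np.ndarray, n_streams: int) -> np.ndarray:
    """Return a precoding matrix built from the top right singular vectors of H."""
    # H has shape (n_rx, n_tx); rows of Vh are right singular vectors of H.
    _, _, Vh = np.linalg.svd(H)
    return Vh[:n_streams].conj().T           # shape (n_tx, n_streams)

if __name__ == "__main__":
    rng = np.random.default_rng(1)
    n_rx, n_tx, n_streams = 4, 8, 2
    H = rng.normal(size=(n_rx, n_tx)) + 1j * rng.normal(size=(n_rx, n_tx))
    F = svd_precoder(H, n_streams)
    H_eff = H @ F                            # effective channel after precoding
    print("effective channel shape:", H_eff.shape)
```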

arXiv:2409.00553 (https://arxiv.org/abs/2409.00553) [pdf, other]
Subjects: cs.LG (Machine Learning); cs.AI (Artificial Intelligence); cs.CY (Computers and Society); stat.ML (Machine Learning)
Multi-Output Distributional Fairness via Post-Processing
Authors: Gang Li, Qihang Lin, Ayush Ghosh, Tianbao Yang
Abstract: Post-processing approaches are becoming prominent techniques to enhance machine learning models' fairness because of their intuitiveness, low computational cost, and excellent scalability. However, most existing post-processing methods are designed for task-specific fairness measures and are limited to single-output models. In this paper, we introduce a post-processing method for multi-output models, such as the ones used for multi-task/multi-class classification and representation learning, to enhance a model's distributional parity, a task-agnostic fairness measure. Existing techniques to achieve distributional parity are based on the (inverse) cumulative density function of a model's output, which limits them to single-output models. Extending previous works, our method employs an optimal transport mapping to move a model's outputs across different groups towards their empirical Wasserstein barycenter. An approximation technique is applied to reduce the complexity of computing the exact barycenter, and a kernel regression method is proposed for extending this process to out-of-sample data. Our empirical studies, which compare our method to existing post-processing baselines on multi-task/multi-class classification and representation learning tasks, demonstrate the effectiveness of the proposed approach.
Submitted 31 August, 2024; originally announced September 2024.
Comments: 17 pages, 4 figures
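For scalar outputs, the barycenter idea can be sketched concretely: the 1-D Wasserstein barycenter of the per-group score distributions is the average of their quantile functions, and each group's scores are pushed onto it by rank. This is a generic single-output illustration with equal group weights, not the multi-output method of the paper.

```python
import numpy as np

def barycenter_postprocess(scores: np.ndarray, groups: np.ndarray) -> np.ndarray:
    """Map each group's 1-D scores onto the groups' common Wasserstein barycenter."""
    qs = np.linspace(0.0, 1.0, 101)
    group_ids = np.unique(groups)
    # Empirical quantile function of each group, evaluated on a common grid.
    quantiles = np.stack([np.quantile(scores[groups == g], qs) for g in group_ids])
    barycenter = quantiles.mean(axis=0)          # average of quantile functions
    adjusted = np.empty_like(scores, dtype=float)
    for g in group_ids:
        mask = groups == g
        # Within-group rank of each score, expressed as a quantile level in (0, 1].
        ranks = np.searchsorted(np.sort(scores[mask]), scores[mask], side="right") / mask.sum()
        adjusted[mask] = np.interp(ranks, qs, barycenter)
    return adjusted

if __name__ == "__main__":
    rng = np.random.default_rng(0)
    groups = rng.integers(0, 2, size=1000)
    scores = rng.normal(loc=groups * 0.8, scale=1.0)   # group 1 scored higher on average
    adj = barycenter_postprocess(scores, groups)
    print([round(adj[groups == g].mean(), 3) for g in (0, 1)])   # group means roughly equal
```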

arXiv:2408.13054 (https://arxiv.org/abs/2408.13054) [pdf, other]
Subjects: cs.RO (Robotics); cs.AI (Artificial Intelligence); cs.LG (Machine Learning); eess.SY (Systems and Control)
cc-DRL: a Convex Combined Deep Reinforcement Learning Flight Control Design for a Morphing Quadrotor
Authors: Tao Yang, Huai-Ning Wu, Jun-Wei Wang
Abstract: In comparison to common quadrotors, the shape change of morphing quadrotors endows them with better flight performance but also results in more complex flight dynamics. Generally, it is extremely difficult or even impossible for morphing quadrotors to establish an accurate mathematical model describing their complex flight dynamics. To address the flight control design problem for morphing quadrotors, this paper resorts to a combination of model-free control techniques (e.g., deep reinforcement learning, DRL) and the convex combination (CC) technique, and proposes a convex-combined-DRL (cc-DRL) flight control algorithm for the position and attitude of a class of morphing quadrotors, where the shape change is realized by the length variation of four arm rods. In the proposed cc-DRL flight control algorithm, the proximal policy optimization algorithm, a model-free DRL algorithm, is utilized to train offline the corresponding optimal flight control laws for selected representative arm-length modes, and a cc-DRL flight control scheme is then constructed by the convex combination technique. Finally, simulation results are presented to show the effectiveness and merit of the proposed flight control algorithm.
Submitted 23 August, 2024; originally announced August 2024.
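The convex-combination step can be pictured separately from the DRL training: with control laws trained offline for a few representative arm lengths, the command at an intermediate length is a convex blend of the two neighbouring controllers' outputs. The linear "controllers" below are hypothetical stand-ins for trained policies, and the numbers are made up.

```python
import numpy as np

# Representative arm lengths (m) for which controllers were trained offline, and
# hypothetical scalar control gains standing in for each trained policy.
ARM_LENGTHS = np.array([0.10, 0.15, 0.20])
GAINS = np.array([2.0, 1.5, 1.2])

def blended_control(state_error: float, arm_length: float) -> float:
    """Convex combination of the two controllers bracketing the current arm length."""
    arm_length = float(np.clip(arm_length, ARM_LENGTHS[0], ARM_LENGTHS[-1]))
    i = int(np.searchsorted(ARM_LENGTHS, arm_length, side="right") - 1)
    i = min(i, len(ARM_LENGTHS) - 2)
    # Convex weight given to the right-hand (longer-arm) controller.
    w = (arm_length - ARM_LENGTHS[i]) / (ARM_LENGTHS[i + 1] - ARM_LENGTHS[i])
    u_left = GAINS[i] * state_error          # stand-in for policy i's action
    u_right = GAINS[i + 1] * state_error     # stand-in for policy i+1's action
    return (1.0 - w) * u_left + w * u_right

if __name__ == "__main__":
    print(blended_control(state_error=0.3, arm_length=0.17))
```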

arXiv:2408.12821 (https://arxiv.org/abs/2408.12821) [pdf, other]
Subjects: cs.CV (Computer Vision and Pattern Recognition); cs.AI (Artificial Intelligence)
Examining the Commitments and Difficulties Inherent in Multimodal Foundation Models for Street View Imagery
Authors: Zhenyuan Yang, Xuhui Lin, Qinyi He, Ziye Huang, Zhengliang Liu, Hanqi Jiang, Peng Shu, Zihao Wu, Yiwei Li, Stephen Law, Gengchen Mai, Tianming Liu, Tao Yang
Abstract: The emergence of Large Language Models (LLMs) and multimodal foundation models (FMs) has generated heightened interest in their applications that integrate vision and language. This paper investigates the capabilities of ChatGPT-4V and Gemini Pro for Street View Imagery, Built Environment, and Interior by evaluating their performance across various tasks. The assessments include street furniture identification, pedestrian and car counts, and road width measurement in Street View Imagery; building function classification, building age analysis, building height analysis, and building structure classification in the Built Environment; and interior room classification, interior design style analysis, interior furniture counts, and interior length measurement in Interior. The results reveal proficiency in length measurement, style analysis, question answering, and basic image understanding, but highlight limitations in detailed recognition and counting tasks. While zero-shot learning shows potential, performance varies depending on the problem domains and image complexities. This study provides new insights into the strengths and weaknesses of multimodal foundation models for practical challenges in Street View Imagery, Built Environment, and Interior. Overall, the findings demonstrate foundational multimodal intelligence, emphasizing the potential of FMs to drive forward interdisciplinary applications at the intersection of computer vision and language.
Submitted 22 August, 2024; originally announced August 2024.

arXiv:2408.12687 (https://arxiv.org/abs/2408.12687) [pdf, other]
Subjects: cs.HC (Human-Computer Interaction)
Bridging the gap between natural user expression with complex automation programming in smart homes
Authors: Yingtian Shi, Xiaoyi Liu, Chun Yu, Tianao Yang, Cheng Gao, Chen Liang, Yuanchun Shi
Abstract: A long-standing challenge in end-user programming (EUP) is to trade off between natural user expression and the complexity of programming tasks. As large language models (LLMs) are empowered to handle semantic inference and natural language understanding, it remains under-explored how such capabilities can help end-users configure complex automation more naturally and easily. We propose AwareAuto, an EUP system that standardizes user expression and performs two-step inference with LLMs to generate automations. AwareAuto allows contextual, multi-modality, and flexible user expression to configure complex automation tasks (e.g., dynamic parameters, multiple conditional branches, and temporal constraints), which are unmanageable in traditional EUP solutions. By studying realistic, complex rules data, AwareAuto gains 91.7% accuracy in matching user intentions and feasibility. We introduce user interaction to ensure system controllability and usability. We discuss the opportunities and challenges of incorporating LLMs in end-user programming techniques and grounding complex smart home contexts.
Submitted 22 August, 2024; originally announced August 2024.

arXiv:2408.12073 (https://arxiv.org/abs/2408.12073) [pdf, other]
Subjects: cs.AR (Hardware Architecture)
Virgo: Cluster-level Matrix Unit Integration in GPUs for Scalability and Energy Efficiency
Authors: Hansung Kim, Ruohan Yan, Joshua You, Tieliang Vamber Yang, Yakun Sophia Shao
Abstract: Modern GPUs incorporate specialized matrix units such as Tensor Cores to accelerate GEMM operations central to deep learning workloads. However, existing matrix unit designs are tightly coupled to the SIMT core, limiting the size and energy efficiency of the operation due to capacity and bandwidth constraints from the register file. Such a limitation in scalability makes it difficult to simultaneously enhance compute throughput and improve energy efficiency in GPUs. To address this challenge, we propose Virgo, a new GPU microarchitecture that integrates dedicated matrix units at the SIMT core cluster level. By physically disaggregating the matrix unit from the SIMT core, Virgo eliminates scalability constraints imposed by the core microarchitecture. Consequently, Virgo increases the granularity of operations at the hardware level, which not only improves data reuse, but also reduces the number of instructions processed in the SIMT core. This reduction in instruction processing decreases energy consumption within the core pipeline, thereby improving the system-level energy efficiency. Our evaluations, implemented in synthesizable RTL, demonstrate that Virgo achieves up to 66.3% reduction in active power and 77.2% reduction in active energy consumption of the system-on-chip compared to the baseline core-coupled design.
Submitted 21 August, 2024; originally announced August 2024.
Comments: 13 pages, 13 figures. Under review at ASPLOS 2025

arXiv:2408.11873 (https://arxiv.org/abs/2408.11873) [pdf, other]
Subjects: eess.AS (Audio and Speech Processing); cs.CR (Cryptography and Security); cs.LG (Machine Learning)
Parameter-Efficient Transfer Learning under Federated Learning for Automatic Speech Recognition
Authors: Xuan Kan, Yonghui Xiao, Tien-Ju Yang, Nanxin Chen, Rajiv Mathews
Abstract: This work explores the challenge of enhancing Automatic Speech Recognition (ASR) model performance across various user-specific domains while preserving user data privacy. We employ federated learning and parameter-efficient domain adaptation methods to address (1) the massive data requirement of ASR models in user-specific scenarios and (2) the substantial communication cost between servers and clients during federated learning. We demonstrate that when equipped with proper adapters, ASR models under federated tuning can achieve performance similar to centrally tuned ones, thus providing a potential direction for future privacy-preserving ASR services. In addition, we investigate the efficiency of different adapters and adapter incorporation strategies under the federated learning setting.
Submitted 19 August, 2024; originally announced August 2024.
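The communication saving in such a setup comes from exchanging only the small adapter weights instead of the full ASR model. Below is a minimal sketch of that round structure, with dictionaries of NumPy arrays standing in for adapter parameters and a plain element-wise mean standing in for the server-side aggregation; it does not reflect the paper's actual training code.

```python
import numpy as np
from typing import Dict, List

Params = Dict[str, np.ndarray]

def local_update(adapter: Params, lr: float = 0.1) -> Params:
    # Hypothetical client step: nudge adapter weights with a fake local gradient.
    rng = np.random.default_rng()
    return {k: v - lr * rng.normal(size=v.shape) for k, v in adapter.items()}

def federated_average(client_updates: List[Params]) -> Params:
    """Server aggregation: element-wise mean over the clients' adapter weights only."""
    keys = client_updates[0].keys()
    return {k: np.mean([u[k] for u in client_updates], axis=0) for k in keys}

if __name__ == "__main__":
    # Only these small adapter tensors cross the network; the frozen ASR backbone never does.
    global_adapter = {"down_proj": np.zeros((8, 2)), "up_proj": np.zeros((2, 8))}
    for _ in range(3):                                       # three federated rounds
        updates = [local_update(global_adapter) for _ in range(4)]   # four clients
        global_adapter = federated_average(updates)
    print({k: v.shape for k, v in global_adapter.items()})
```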

arXiv:2408.10147 (https://arxiv.org/abs/2408.10147) [pdf, other]
Subjects: cs.LG (Machine Learning); cs.CL (Computation and Language); cs.IT (Information Theory); math.OC (Optimization and Control); stat.ML (Machine Learning)
In-Context Learning with Representations: Contextual Generalization of Trained Transformers
Authors: Tong Yang, Yu Huang, Yingbin Liang, Yuejie Chi
Abstract: In-context learning (ICL) refers to a remarkable capability of pretrained large language models, which can learn a new task given a few examples during inference. However, theoretical understanding of ICL is largely under-explored, particularly whether transformers can be trained to generalize to unseen examples in a prompt, which will require the model to acquire contextual knowledge of the prompt for generalization. This paper investigates the training dynamics of transformers by gradient descent through the lens of non-linear regression tasks. The contextual generalization here can be attained via learning the template function for each task in-context, where all template functions lie in a linear space with $m$ basis functions. We analyze the training dynamics of one-layer multi-head transformers to predict unlabeled inputs in-context given partially labeled prompts, where the labels contain Gaussian noise and the number of examples in each prompt is not sufficient to determine the template. Under mild assumptions, we show that the training loss for a one-layer multi-head transformer converges linearly to a global minimum. Moreover, the transformer effectively learns to perform ridge regression over the basis functions. To our knowledge, this study is the first provable demonstration that transformers can learn contextual (i.e., template) information to generalize to both unseen examples and tasks when prompts contain only a small number of query-answer pairs.
Submitted 25 September, 2024; v1 submitted 19 August, 2024; originally announced August 2024.
Comments: Accepted by NeurIPS 2024
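The ridge-regression behaviour that the analysis attributes to the trained transformer can be written out directly: fit ridge regression over m fixed basis functions to the noisy labelled part of a prompt, then predict the unlabelled inputs. The polynomial basis, noise level, and sizes below are arbitrary illustrative choices, not the paper's setup.

```python
import numpy as np

def ridge_over_basis(x_lab, y_lab, x_query, basis, lam=0.1):
    """Ridge regression on basis features: w = (Phi^T Phi + lam*I)^{-1} Phi^T y."""
    Phi = basis(x_lab)                               # (n_labelled, m)
    m = Phi.shape[1]
    w = np.linalg.solve(Phi.T @ Phi + lam * np.eye(m), Phi.T @ y_lab)
    return basis(x_query) @ w                        # predictions for unlabelled inputs

def poly_basis(x, m=4):
    # m polynomial basis functions 1, x, x^2, ..., x^{m-1}.
    return np.stack([x**k for k in range(m)], axis=1)

if __name__ == "__main__":
    rng = np.random.default_rng(0)
    x_lab = rng.uniform(-1, 1, size=8)               # a few labelled prompt examples
    y_lab = np.sin(2 * x_lab) + 0.1 * rng.normal(size=8)   # noisy labels
    x_query = np.linspace(-1, 1, 5)                  # unlabelled inputs to predict
    print(ridge_over_basis(x_lab, y_lab, x_query, poly_basis))
```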

arXiv:2408.10119 (https://arxiv.org/abs/2408.10119) [pdf, other]
Subjects: cs.CV (Computer Vision and Pattern Recognition); cs.AI (Artificial Intelligence)
Factorized-Dreamer: Training A High-Quality Video Generator with Limited and Low-Quality Data
Authors: Tao Yang, Yangming Shi, Yunwen Huang, Feng Chen, Yin Zheng, Lei Zhang
Abstract: Text-to-video (T2V) generation has gained significant attention due to its wide applications to video generation, editing, enhancement and translation, etc. However, high-quality (HQ) video synthesis is extremely challenging because of the diverse and complex motions that exist in the real world. Most existing works struggle to address this problem by collecting large-scale HQ videos, which are inaccessible to the community. In this work, we show that publicly available limited and low-quality (LQ) data are sufficient to train a HQ video generator without recaptioning or finetuning. We factorize the whole T2V generation process into two steps: generating an image conditioned on a highly descriptive caption, and synthesizing the video conditioned on the generated image and a concise caption of motion details. Specifically, we present Factorized-Dreamer, a factorized spatiotemporal framework with several critical designs for T2V generation, including an adapter to combine text and image embeddings, a pixel-aware cross attention module to capture pixel-level image information, a T5 text encoder to better understand motion description, and a PredictNet to supervise optical flows. We further present a noise schedule, which plays a key role in ensuring the quality and stability of video generation. Our model lowers the requirements on detailed captions and HQ videos, and can be directly trained on limited LQ datasets with noisy and brief captions such as WebVid-10M, largely alleviating the cost of collecting large-scale HQ video-text pairs. Extensive experiments in a variety of T2V and image-to-video generation tasks demonstrate the effectiveness of our proposed Factorized-Dreamer. Our source codes are available at https://github.com/yangxy/Factorized-Dreamer/.
Submitted 19 August, 2024; originally announced August 2024.