Search | arXiv e-print repository
<a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1–50 of 404 results for author: <span class="mathjax">Xia, T</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a> </span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&query=Xia%2C+T">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." 
type="text" value="Xia, T"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Xia%2C+T&terms-0-field=author&size=50&order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Xia, T"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. 
</div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Xia%2C+T&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Xia%2C+T&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Xia%2C+T&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Xia%2C+T&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Xia%2C+T&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Xia%2C+T&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">…</span></li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.01455">arXiv:2412.01455</a> <span> [<a href="https://arxiv.org/pdf/2412.01455">pdf</a>, <a href="https://arxiv.org/format/2412.01455">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Early Exit Is a Natural Capability in Transformer-based Models: An Empirical Study on Early Exit without Joint Optimization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Shan%2C+W">Weiqiao Shan</a>, <a href="/search/cs?searchtype=author&query=Meng%2C+L">Long Meng</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+T">Tong Zheng</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+Y">Yingfeng Luo</a>, <a href="/search/cs?searchtype=author&query=Li%2C+B">Bei Li</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+j">junxin Wang</a>, <a href="/search/cs?searchtype=author&query=Xiao%2C+T">Tong Xiao</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+J">Jingbo Zhu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.01455v1-abstract-short" style="display: inline;"> Large language models (LLMs) exhibit exceptional performance across various downstream tasks. However, they encounter limitations due to slow inference speeds stemming from their extensive parameters. The early exit (EE) is an approach that aims to accelerate auto-regressive decoding. 
EE generates outputs from intermediate layers instead of using the whole model, which offers a promising solution… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.01455v1-abstract-full').style.display = 'inline'; document.getElementById('2412.01455v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.01455v1-abstract-full" style="display: none;"> Large language models (LLMs) exhibit exceptional performance across various downstream tasks. However, they encounter limitations due to slow inference speeds stemming from their extensive parameters. The early exit (EE) is an approach that aims to accelerate auto-regressive decoding. EE generates outputs from intermediate layers instead of using the whole model, which offers a promising solution to this challenge. However, additional output layers and joint optimization used in conventional EE hinder the application of EE in LLMs. In this paper, we explore the possibility of LLMs EE without additional output layers and joint optimization. Our findings indicate that EE is a natural capability within transformer-based models. While joint optimization does not give model EE capability, it must be employed to address challenges by improving the accuracy of locating the optimal EE layer through gating functions. Additionally, our study reveals patterns in EE behavior from a sub-word perspective based on the LLaMA model and the potential possibility for EE based on sub-layers. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.01455v1-abstract-full').style.display = 'none'; document.getElementById('2412.01455v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.01027">arXiv:2412.01027</a> <span> [<a href="https://arxiv.org/pdf/2412.01027">pdf</a>, <a href="https://arxiv.org/format/2412.01027">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Unleashing In-context Learning of Autoregressive Models for Few-shot Image Manipulation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lai%2C+B">Bolin Lai</a>, <a href="/search/cs?searchtype=author&query=Juefei-Xu%2C+F">Felix Juefei-Xu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+M">Miao Liu</a>, <a href="/search/cs?searchtype=author&query=Dai%2C+X">Xiaoliang Dai</a>, <a href="/search/cs?searchtype=author&query=Mehta%2C+N">Nikhil Mehta</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+C">Chenguang Zhu</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+Z">Zeyi Huang</a>, <a href="/search/cs?searchtype=author&query=Rehg%2C+J+M">James M. 
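As a concrete illustration of the mechanism this abstract describes, the sketch below decodes from the first layer whose prediction clears a confidence threshold. It is a minimal sketch, not the paper's implementation: it assumes a decoder-only model whose intermediate hidden states can be projected through the shared final LM head (matching the no-extra-output-layers setting), and the fixed threshold stands in for the learned gating functions the abstract discusses.

```python
import torch

def early_exit_token(layer_hiddens, lm_head, norm, threshold=0.9):
    """Return (token_id, layers_used): decode from the first layer whose
    prediction clears a confidence gate, instead of always running the
    full stack."""
    for depth, h in enumerate(layer_hiddens):
        logits = lm_head(norm(h))             # reuse the shared LM head: no extra exit heads
        probs = torch.softmax(logits, dim=-1)
        conf, token = probs.max(dim=-1)
        if conf >= threshold:                 # confidence gate (stand-in for learned gating)
            return token.item(), depth + 1
    return token.item(), len(layer_hiddens)   # no early exit: fall back to the last layer

# Toy usage with random states: 12 layers, hidden size 64, vocabulary 100.
norm = torch.nn.LayerNorm(64)
lm_head = torch.nn.Linear(64, 100, bias=False)
token_id, layers_used = early_exit_token([torch.randn(64) for _ in range(12)], lm_head, norm)
```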
2. arXiv:2412.01027 [pdf, other] (cs.CV)
Unleashing In-context Learning of Autoregressive Models for Few-shot Image Manipulation
Authors: Bolin Lai, Felix Juefei-Xu, Miao Liu, Xiaoliang Dai, Nikhil Mehta, Chenguang Zhu, Zeyi Huang, James M. Rehg, Sangmin Lee, Ning Zhang, Tong Xiao
Abstract: Text-guided image manipulation has experienced notable advancement in recent years. In order to mitigate linguistic ambiguity, few-shot learning with visual examples has been applied for instructions that are underrepresented in the training set or difficult to describe purely in language. However, learning from visual prompts requires strong reasoning capability, which diffusion models struggle with. To address this issue, we introduce a novel multi-modal autoregressive model, dubbed InstaManip, that can instantly learn a new image manipulation operation from textual and visual guidance via in-context learning, and apply it to new query images. Specifically, we propose an innovative group self-attention mechanism to break down the in-context learning process into two separate stages -- learning and applying, which simplifies the complex problem into two easier tasks. We also introduce a relation regularization method to further disentangle image transformation features from irrelevant contents in exemplar images. Extensive experiments suggest that our method surpasses previous few-shot image manipulation models by a notable margin (≥19% in human evaluation). We also find our model can be further boosted by increasing the number or diversity of exemplar images.
Submitted 1 December, 2024; originally announced December 2024.
Comments: 18 pages, 16 figures, 5 tables
3. arXiv:2411.13343 [pdf, other] (cs.CL, cs.AI)
Fact-Level Confidence Calibration and Self-Correction
Authors: Yige Yuan, Bingbing Xu, Hexiang Tan, Fei Sun, Teng Xiao, Wei Li, Huawei Shen, Xueqi Cheng
Abstract: Confidence calibration in LLMs, i.e., aligning their self-assessed confidence with the actual accuracy of their responses, enables them to self-evaluate the correctness of their outputs. However, current calibration methods for LLMs typically estimate two scalars to represent overall response confidence and correctness, which is inadequate for long-form generation, where the response includes multiple atomic facts and may be partially confident and correct. These methods also overlook the relevance of each fact to the query. To address these challenges, we propose a Fact-Level Calibration framework that operates at a finer granularity, calibrating confidence to relevance-weighted correctness at the fact level. Furthermore, comprehensive analysis under the framework inspired the development of Confidence-Guided Fact-level Self-Correction (ConFix), which uses high-confidence facts within a response as additional knowledge to improve low-confidence ones. Extensive experiments across four datasets and six models demonstrate that ConFix effectively mitigates hallucinations without requiring external knowledge sources such as retrieval systems.
Submitted 20 November, 2024; originally announced November 2024.
Comments: Code is available at https://github.com/yuanyige/fact-calibration
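To make the fact-level framing concrete, here is a small sketch of a relevance-weighted calibration error over atomic facts. The `Fact` fields and the aggregation rule are assumptions chosen for illustration, not the paper's exact formulation of ConFix.

```python
from dataclasses import dataclass

@dataclass
class Fact:
    text: str
    confidence: float   # model's self-assessed confidence, in [0, 1]
    correctness: float  # judged correctness of the fact, in [0, 1]
    relevance: float    # relevance of the fact to the query, in [0, 1]

def fact_level_calibration_error(facts):
    """Relevance-weighted gap between per-fact confidence and correctness.
    A single response-level scalar hides partially correct answers;
    scoring each atomic fact keeps that structure."""
    total = sum(f.relevance for f in facts) or 1.0
    return sum(f.relevance * abs(f.confidence - f.correctness) for f in facts) / total

facts = [
    Fact("Paris is the capital of France", confidence=0.95, correctness=1.0, relevance=1.0),
    Fact("It became the capital in 1500", confidence=0.80, correctness=0.0, relevance=0.3),
]
print(fact_level_calibration_error(facts))  # ~0.22: the overconfident wrong fact dominates
```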
4. arXiv:2411.11938 [pdf, other] (cs.GR, cs.AI)
Newclid: A User-Friendly Replacement for AlphaGeometry
Authors: Vladmir Sicca, Tianxiang Xia, Mathïs Fédérico, Philip John Gorinski, Simon Frieder, Shangling Jui
Abstract: We introduce a new symbolic solver for geometry, called Newclid, which is based on AlphaGeometry. Newclid contains a symbolic solver called DDARN (derived from DDAR-Newclid), a significant refactoring and upgrade of AlphaGeometry's DDAR symbolic solver that is more user-friendly, both for the end user and for a programmer wishing to extend the codebase. For the programmer, improvements include a modularized codebase and new debugging and visualization tools. For the user, Newclid contains a new command line interface (CLI) that provides interfaces for agents to guide DDARN. DDARN is flexible with respect to its internal reasoning, which can be steered by agents. Further, we support input from GeoGebra to make Newclid accessible for educational contexts. In addition, the scope of problems that Newclid can solve has been expanded to include an improved understanding of metric geometry concepts (length, angle) and the use of theorems such as the Pythagorean theorem in proofs. Bugs have been fixed, and reproducibility has been improved. Lastly, we re-evaluated the five remaining problems from the original AG-30 dataset that AlphaGeometry was not able to solve and contrasted them with the abilities of DDARN, running in breadth-first-search agentic mode (which corresponds to how DDARN runs by default), finding that DDARN solves an additional problem. We have open-sourced our code at: https://github.com/LMCRC/Newclid
Submitted 18 November, 2024; originally announced November 2024.
Comments: 51 pages
5. arXiv:2411.10821 [pdf, other] (cs.LG, q-bio.BM)
GeomCLIP: Contrastive Geometry-Text Pre-training for Molecules
Authors: Teng Xiao, Chao Cui, Huaisheng Zhu, Vasant G. Honavar
Abstract: Pretraining molecular representations is crucial for drug and material discovery. Recent methods focus on learning representations from geometric structures, effectively capturing 3D position information. Yet, they overlook the rich information in biomedical texts, which detail molecules' properties and substructures. With this in mind, we set up a data collection effort for 200K pairs of ground-state geometric structures and biomedical texts, resulting in a PubChem3D dataset. Based on this dataset, we propose the GeomCLIP framework to enhance multi-modal representation learning from molecular structures and biomedical text. During pre-training, we design two types of tasks, i.e., multimodal representation alignment and unimodal denoising pretraining, to align the 3D geometric encoder with textual information and, at the same time, preserve its original representation power. Experimental results show the effectiveness of GeomCLIP in various tasks such as molecular property prediction, zero-shot text-molecule retrieval, and 3D molecule captioning. Our code and collected dataset are available at https://github.com/xiaocui3737/GeomCLIP
Submitted 16 November, 2024; originally announced November 2024.
Comments: BIBM 2024
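The multimodal representation alignment task described here is, at heart, a CLIP-style symmetric contrastive objective between the 3D geometric encoder and the text encoder. Below is a minimal sketch of that standard objective; GeomCLIP's actual loss, and its pairing with the denoising task, may differ in detail.

```python
import torch
import torch.nn.functional as F

def clip_style_alignment_loss(geom_emb, text_emb, temperature=0.07):
    """Symmetric InfoNCE between a batch of 3D-geometry embeddings and
    their paired text embeddings: matched pairs sit on the diagonal of
    the similarity matrix and are pulled together."""
    g = F.normalize(geom_emb, dim=-1)
    t = F.normalize(text_emb, dim=-1)
    logits = g @ t.T / temperature        # [B, B] pairwise similarities
    labels = torch.arange(g.size(0))      # index of each row's positive pair
    return (F.cross_entropy(logits, labels) +
            F.cross_entropy(logits.T, labels)) / 2

# Toy usage: batch of 8 pairs with 256-dim embeddings.
loss = clip_style_alignment_loss(torch.randn(8, 256), torch.randn(8, 256))
```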
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">BIBM 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.10484">arXiv:2411.10484</a> <span> [<a href="https://arxiv.org/pdf/2411.10484">pdf</a>, <a href="https://arxiv.org/format/2411.10484">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Data Structures and Algorithms">cs.DS</span> </div> </div> <p class="title is-5 mathjax"> iFlow: An Interactive Max-Flow/Min-Cut Algorithms Visualizer </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ye%2C+M">Muyang Ye</a>, <a href="/search/cs?searchtype=author&query=Xia%2C+T">Tianrui Xia</a>, <a href="/search/cs?searchtype=author&query=Zu%2C+T">Tianxin Zu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Q">Qian Wang</a>, <a href="/search/cs?searchtype=author&query=Kempe%2C+D">David Kempe</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.10484v1-abstract-short" style="display: inline;"> The Max-Flow/Min-Cut problem is a fundamental tool in graph theory, with applications in many domains, including data mining, image segmentation, transportation planning, and many types of assignment problems, in addition to being an essential building block for many other algorithms. The Ford-Fulkerson Algorithm for Max-Flow/Min-Cut and its variants are therefore commonly taught in undergraduate… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10484v1-abstract-full').style.display = 'inline'; document.getElementById('2411.10484v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.10484v1-abstract-full" style="display: none;"> The Max-Flow/Min-Cut problem is a fundamental tool in graph theory, with applications in many domains, including data mining, image segmentation, transportation planning, and many types of assignment problems, in addition to being an essential building block for many other algorithms. The Ford-Fulkerson Algorithm for Max-Flow/Min-Cut and its variants are therefore commonly taught in undergraduate and beginning graduate algorithms classes. However, these algorithms -- and in particular the so-called residual graphs they utilize -- often pose significant challenges for students. To help students achieve a deeper understanding, we developed iFlow, an interactive visualization tool for the Ford-Fulkerson Algorithm and its variants. iFlow lets users design or import flow networks, and execute the algorithm by hand. In particular, the user can select an augmentation path and amount, and then update the residual graph. The user is given detailed feedback on mistakes, and can also have iFlow auto-complete each step, to use it as a demonstration tool while still in the initial learning stages. iFlow has been made publicly available and open-sourced. We deployed iFlow in an undergraduate algorithms class, and collected students' self-reported learning benefits via an optional survey. 
All respondents considered the tool at least somewhat useful and engaging, with most rating it either as useful/engaging or very useful/engaging. Students also generally reported a significant increase in understanding of the algorithm. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10484v1-abstract-full').style.display = 'none'; document.getElementById('2411.10484v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">This paper is accepted by SIGCSE 2025 TS. Due to the page limit we can not include the appendix in the SIGCSE version. So we decide to include them on arXiv so that the SIGCSE version can point to the arXiv version. Since the final SIGCSE version is due by Nov. 17, it would be really helpful if this submission can go online as soon as possible. Thanks!</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.06444">arXiv:2411.06444</a> <span> [<a href="https://arxiv.org/pdf/2411.06444">pdf</a>, <a href="https://arxiv.org/format/2411.06444">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> SamRobNODDI: Q-Space Sampling-Augmented Continuous Representation Learning for Robust and Generalized NODDI </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xiao%2C+T">Taohui Xiao</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+J">Jian Cheng</a>, <a href="/search/cs?searchtype=author&query=Fan%2C+W">Wenxin Fan</a>, <a href="/search/cs?searchtype=author&query=Dong%2C+E">Enqing Dong</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+H">Hairong Zheng</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+S">Shanshan Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.06444v1-abstract-short" style="display: inline;"> Neurite Orientation Dispersion and Density Imaging (NODDI) microstructure estimation from diffusion magnetic resonance imaging (dMRI) is of great significance for the discovery and treatment of various neurological diseases. Current deep learning-based methods accelerate the speed of NODDI parameter estimation and improve the accuracy. 
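For readers who want the algorithm iFlow visualizes in runnable form, here is a compact Edmonds-Karp variant of Ford-Fulkerson (shortest augmenting paths over a residual graph). This is a textbook sketch, not iFlow's code.

```python
from collections import deque

def max_flow(capacity, source, sink):
    """Edmonds-Karp: repeatedly find a shortest augmenting path in the
    residual graph and push the bottleneck amount of flow along it.

    capacity: dict-of-dicts, capacity[u][v] = capacity of edge u -> v
    """
    # Residual graph: forward edges start at full capacity, reverse at 0.
    residual = {u: dict(nbrs) for u, nbrs in capacity.items()}
    for u, nbrs in capacity.items():
        for v in nbrs:
            residual.setdefault(v, {}).setdefault(u, 0)
    flow = 0
    while True:
        # BFS for an augmenting path with spare residual capacity.
        parent = {source: None}
        queue = deque([source])
        while queue and sink not in parent:
            u = queue.popleft()
            for v, cap in residual.get(u, {}).items():
                if cap > 0 and v not in parent:
                    parent[v] = u
                    queue.append(v)
        if sink not in parent:
            return flow  # no augmenting path left: current flow is maximum
        # Bottleneck along the path, then update residual capacities.
        path, v = [], sink
        while parent[v] is not None:
            path.append((parent[v], v))
            v = parent[v]
        bottleneck = min(residual[u][v] for u, v in path)
        for u, v in path:
            residual[u][v] -= bottleneck
            residual[v][u] += bottleneck  # reverse edge lets later paths undo flow
        flow += bottleneck

cap = {"s": {"a": 10, "b": 5}, "a": {"b": 15, "t": 10}, "b": {"t": 10}}
assert max_flow(cap, "s", "t") == 15
```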
7. arXiv:2411.06444 [pdf, other] (cs.CV, eess.IV)
SamRobNODDI: Q-Space Sampling-Augmented Continuous Representation Learning for Robust and Generalized NODDI
Authors: Taohui Xiao, Jian Cheng, Wenxin Fan, Enqing Dong, Hairong Zheng, Shanshan Wang
Abstract: Neurite Orientation Dispersion and Density Imaging (NODDI) microstructure estimation from diffusion magnetic resonance imaging (dMRI) is of great significance for the discovery and treatment of various neurological diseases. Current deep learning-based methods accelerate NODDI parameter estimation and improve its accuracy. However, most methods require the number and coordinates of gradient directions during testing and training to remain strictly consistent, significantly limiting the generalization and robustness of these models in NODDI parameter estimation. In this paper, we propose a q-space sampling augmentation-based continuous representation learning framework (SamRobNODDI) to achieve robust and generalized NODDI. Specifically, a continuous representation learning method based on q-space sampling augmentation is introduced to fully explore the information between different gradient directions in q-space. Furthermore, we design a sampling consistency loss to constrain the outputs of different sampling schemes, ensuring that the outputs remain as consistent as possible, thereby further enhancing performance and robustness to varying q-space sampling schemes. SamRobNODDI is also a flexible framework that can be applied to different backbone networks. To validate the effectiveness of the proposed method, we compared it with 7 state-of-the-art methods across 18 different q-space sampling schemes, demonstrating that the proposed SamRobNODDI has better performance, robustness, generalization, and flexibility.
Submitted 10 November, 2024; originally announced November 2024.
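A rough sketch of what a sampling consistency loss can look like follows. It is heavily simplified and hypothetical: subsampling is modeled here by zero-masking gradient directions, whereas the paper builds on a continuous representation over q-space, and the weight `lam` is an arbitrary choice.

```python
import torch
import torch.nn.functional as F

def sampling_consistency_loss(model, dwi, masks, target, lam=0.1):
    """Run one estimator on several q-space subsamples of the same scan
    and penalize disagreement between the resulting parameter maps, so
    the estimator stops depending on one fixed gradient scheme.

    dwi:    [B, D, H, W] signal over D gradient directions
    masks:  list of boolean [D] tensors, one per sampling scheme
    target: ground-truth NODDI parameter maps
    """
    preds = [model(dwi * m.float().view(1, -1, 1, 1)) for m in masks]
    fit = sum(F.mse_loss(p, target) for p in preds) / len(preds)
    pairs = [(i, j) for i in range(len(preds)) for j in range(i + 1, len(preds))]
    consistency = sum(F.mse_loss(preds[i], preds[j]) for i, j in pairs) / len(pairs)
    return fit + lam * consistency
```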
8. arXiv:2411.04798 [pdf, other] (cs.HC, cs.IR)
Orbit: A Framework for Designing and Evaluating Multi-objective Rankers
Authors: Chenyang Yang, Tesi Xiao, Michael Shavlovsky, Christian Kästner, Tongshuang Wu
Abstract: Machine learning in production needs to balance multiple objectives: this is particularly evident in ranking or recommendation models, where conflicting objectives such as user engagement, satisfaction, diversity, and novelty must be considered at the same time. However, designing multi-objective rankers is inherently a dynamic wicked problem -- there is no single optimal solution, and the needs evolve over time. Effective design requires collaboration between cross-functional teams and careful analysis of a wide range of information. In this work, we introduce Orbit, a conceptual framework for Objective-centric Ranker Building and Iteration. The framework places objectives at the center of the design process, to serve as boundary objects for communication and to guide practitioners in design and evaluation. We implement Orbit as an interactive system, which enables stakeholders to interact with objective spaces directly and supports real-time exploration and evaluation of design trade-offs. We evaluate Orbit through a user study involving twelve industry practitioners, showing that it supports efficient design space exploration, leads to more informed decision-making, and enhances awareness of the inherent trade-offs of multiple objectives. Orbit (1) opens up new opportunities for an objective-centric design process for any multi-objective ML model, and (2) sheds light on future designs that push practitioners to go beyond a narrow metric-centric or example-centric mindset.
Submitted 7 November, 2024; originally announced November 2024.
9. arXiv:2411.04549 [pdf, other] (cs.RO, cs.AI, cs.LG)
Vision Language Models are In-Context Value Learners
Authors: Yecheng Jason Ma, Joey Hejna, Ayzaan Wahid, Chuyuan Fu, Dhruv Shah, Jacky Liang, Zhuo Xu, Sean Kirmani, Peng Xu, Danny Driess, Ted Xiao, Jonathan Tompson, Osbert Bastani, Dinesh Jayaraman, Wenhao Yu, Tingnan Zhang, Dorsa Sadigh, Fei Xia
Abstract: Predicting temporal progress from visual trajectories is important for intelligent robots that can learn, adapt, and improve. However, learning such a progress estimator, or temporal value function, across different tasks and domains requires both a large amount of diverse data and methods that can scale and generalize. To address these challenges, we present Generative Value Learning (GVL), a universal value function estimator that leverages the world knowledge embedded in vision-language models (VLMs) to predict task progress. Naively asking a VLM to predict values for a video sequence performs poorly due to the strong temporal correlation between successive frames. Instead, GVL poses value estimation as a temporal ordering problem over shuffled video frames; this seemingly more challenging task encourages VLMs to more fully exploit their underlying semantic and temporal grounding capabilities to differentiate frames based on their perceived task progress, consequently producing significantly better value predictions. Without any robot- or task-specific training, GVL can zero-shot and few-shot predict effective values in context for more than 300 distinct real-world tasks across diverse robot platforms, including challenging bimanual manipulation tasks. Furthermore, we demonstrate that GVL permits flexible multi-modal in-context learning via examples from heterogeneous tasks and embodiments, such as human videos. The generality of GVL enables various downstream applications pertinent to visuomotor policy learning, including dataset filtering, success detection, and advantage-weighted regression -- all without any model training or finetuning.
Submitted 7 November, 2024; originally announced November 2024.
Comments: Project website and demo: https://generative-value-learning.github.io/
10. arXiv:2411.04399 [pdf, other] (cs.CV)
ProGraph: Temporally-alignable Probability Guided Graph Topological Modeling for 3D Human Reconstruction
Authors: Hongsheng Wang, Zehui Feng, Tong Xiao, Genfan Yang, Shengyu Zhang, Fei Wu, Feng Lin
Abstract: Current 3D human motion reconstruction methods from monocular videos rely on features within the current reconstruction window, leading to distortion and deformations in the human structure under local occlusions or blurriness in video frames. To estimate realistic 3D human mesh sequences based on incomplete features, we propose Temporally-alignable Probability Guided Graph Topological Modeling for 3D Human Reconstruction (ProGraph). For missing parts recovery, we exploit the explicit topological-aware probability distribution across the entire motion sequence. To restore the complete human, Graph Topological Modeling (GTM) learns the underlying topological structure, focusing on the relationships inherent in the individual parts. Next, to generate blurred motion parts, Temporal-alignable Probability Distribution (TPDist) utilizes the GTM to predict features based on distribution. This interactive mechanism facilitates motion consistency, allowing the restoration of human parts. Furthermore, Hierarchical Human Loss (HHLoss) constrains the probability distribution errors of inter-frame features during topological structure variation. Our method achieves superior results compared with other SOTA methods in addressing occlusions and blurriness on 3DPW.
Submitted 6 November, 2024; originally announced November 2024.
11. arXiv:2411.03409 [pdf, other] (cs.RO, cs.AI)
STEER: Flexible Robotic Manipulation via Dense Language Grounding
Authors: Laura Smith, Alex Irpan, Montserrat Gonzalez Arenas, Sean Kirmani, Dmitry Kalashnikov, Dhruv Shah, Ted Xiao
Abstract: The complexity of the real world demands robotic systems that can intelligently adapt to unseen situations. We present STEER, a robot learning framework that bridges high-level, commonsense reasoning with precise, flexible low-level control. Our approach translates complex situational awareness into actionable low-level behavior through training language-grounded policies with dense annotation. By structuring policy training around fundamental, modular manipulation skills expressed in natural language, STEER exposes an expressive interface for humans or Vision-Language Models (VLMs) to intelligently orchestrate the robot's behavior by reasoning about the task and context. Our experiments demonstrate that the skills learned via STEER can be combined to synthesize novel behaviors to adapt to new situations or perform completely new tasks without additional data collection or training.
Submitted 5 November, 2024; originally announced November 2024.
Comments: Project website: https://lauramsmith.github.io/steer/
The precision of the solution to ODEs significantly affects parameter optimization, thereby impacting model performance. In this work, we present a series of advanced explorations of Transformer architecture design to minimize the error compared to the true "solution." First, we introduce a predictor-corrector learning framework to minimize truncation errors, which consists of a high-order predictor and a multistep corrector. Second, we propose an exponential moving average-based coefficient learning method to strengthen our higher-order predictor. Extensive experiments on large-scale machine translation, abstractive summarization, language modeling, and natural language understanding benchmarks demonstrate the superiority of our approach. On the WMT'14 English-German and English-French tasks, our model achieved BLEU scores of 30.95 and 44.27, respectively. Furthermore, on the OPUS multilingual machine translation task, our model surpasses a robust 3.8B DeepNet by an average of 2.9 SacreBLEU, using only 1/3 of the parameters. Notably, it also beats LLaMA models by 5.7 accuracy points on the LM Harness Evaluation. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.03042v1-abstract-full').style.display = 'none'; document.getElementById('2411.03042v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by NeurIPS 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.02704">arXiv:2411.02704</a> <span> [<a href="https://arxiv.org/pdf/2411.02704">pdf</a>, <a href="https://arxiv.org/format/2411.02704">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> RT-Affordance: Affordances are Versatile Intermediate Representations for Robot Manipulation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Nasiriany%2C+S">Soroush Nasiriany</a>, <a href="/search/cs?searchtype=author&query=Kirmani%2C+S">Sean Kirmani</a>, <a href="/search/cs?searchtype=author&query=Ding%2C+T">Tianli Ding</a>, <a href="/search/cs?searchtype=author&query=Smith%2C+L">Laura Smith</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+Y">Yuke Zhu</a>, <a href="/search/cs?searchtype=author&query=Driess%2C+D">Danny Driess</a>, <a href="/search/cs?searchtype=author&query=Sadigh%2C+D">Dorsa Sadigh</a>, <a href="/search/cs?searchtype=author&query=Xiao%2C+T">Ted Xiao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span
class="abstract-short has-text-grey-dark mathjax" id="2411.02704v1-abstract-short" style="display: inline;"> We explore how intermediate policy representations can facilitate generalization by providing guidance on how to perform manipulation tasks. Existing representations such as language, goal images, and trajectory sketches have been shown to be helpful, but these representations either do not provide enough context or provide over-specified context that yields less robust policies. We propose condit… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02704v1-abstract-full').style.display = 'inline'; document.getElementById('2411.02704v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.02704v1-abstract-full" style="display: none;"> We explore how intermediate policy representations can facilitate generalization by providing guidance on how to perform manipulation tasks. Existing representations such as language, goal images, and trajectory sketches have been shown to be helpful, but these representations either do not provide enough context or provide over-specified context that yields less robust policies. We propose conditioning policies on affordances, which capture the pose of the robot at key stages of the task. Affordances offer expressive yet lightweight abstractions, are easy for users to specify, and facilitate efficient learning by transferring knowledge from large internet datasets. Our method, RT-Affordance, is a hierarchical model that first proposes an affordance plan given the task language, and then conditions the policy on this affordance plan to perform manipulation. Our model can flexibly bridge heterogeneous sources of supervision including large web datasets and robot trajectories. We additionally train our model on cheap-to-collect in-domain affordance images, allowing us to learn new tasks without collecting any additional costly robot trajectories. We show on a diverse set of novel tasks how RT-Affordance exceeds the performance of existing methods by over 50%, and we empirically demonstrate that affordances are robust to novel settings. Videos available at https://snasiriany.me/rt-affordance <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02704v1-abstract-full').style.display = 'none'; document.getElementById('2411.02704v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.01438">arXiv:2411.01438</a> <span> [<a href="https://arxiv.org/pdf/2411.01438">pdf</a>, <a href="https://arxiv.org/format/2411.01438">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> SkyServe: Serving AI Models across Regions and Clouds with Spot Instances </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Mao%2C+Z">Ziming Mao</a>, <a href="/search/cs?searchtype=author&query=Xia%2C+T">Tian Xia</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+Z">Zhanghao Wu</a>, <a href="/search/cs?searchtype=author&query=Chiang%2C+W">Wei-Lin Chiang</a>, <a href="/search/cs?searchtype=author&query=Griggs%2C+T">Tyler Griggs</a>, <a href="/search/cs?searchtype=author&query=Bhardwaj%2C+R">Romil Bhardwaj</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Z">Zongheng Yang</a>, <a href="/search/cs?searchtype=author&query=Shenker%2C+S">Scott Shenker</a>, <a href="/search/cs?searchtype=author&query=Stoica%2C+I">Ion Stoica</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.01438v1-abstract-short" style="display: inline;"> Recent years have witnessed an explosive growth of AI models. The high cost of hosting AI services on GPUs and their demanding service requirements make it timely and challenging to lower service costs and guarantee service quality. While spot instances have long been offered with a large discount, spot preemptions have discouraged users from using them to host model replicas when serving AI mode… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.01438v1-abstract-full').style.display = 'inline'; document.getElementById('2411.01438v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.01438v1-abstract-full" style="display: none;"> Recent years have witnessed an explosive growth of AI models. The high cost of hosting AI services on GPUs and their demanding service requirements make it timely and challenging to lower service costs and guarantee service quality. While spot instances have long been offered with a large discount, spot preemptions have discouraged users from using them to host model replicas when serving AI models. To address this, we introduce SkyServe, a system that efficiently serves AI models over a mixture of spot and on-demand replicas across regions and clouds. SkyServe intelligently spreads spot replicas across different failure domains (e.g., regions or clouds) to improve availability and reduce correlated preemptions, overprovisions cheap spot replicas beyond what is required as a safeguard against possible preemptions, and dynamically falls back to on-demand replicas when spot replicas become unavailable. We compare SkyServe with both research and production systems on real AI workloads: SkyServe reduces cost by up to 44% while achieving high resource availability compared to using on-demand replicas.
Additionally, SkyServe improves P50, P90, and P99 latency by up to 2.6x, 3.1x, and 2.7x, respectively, compared to other research and production systems. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.01438v1-abstract-full').style.display = 'none'; document.getElementById('2411.01438v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.14952">arXiv:2410.14952</a> <span> [<a href="https://arxiv.org/pdf/2410.14952">pdf</a>, <a href="https://arxiv.org/format/2410.14952">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Atmospheric and Oceanic Physics">physics.ao-ph</span> </div> </div> <p class="title is-5 mathjax"> A Fast AI Surrogate for Coastal Ocean Circulation Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xu%2C+Z">Zelin Xu</a>, <a href="/search/cs?searchtype=author&query=Ren%2C+J">Jie Ren</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yupu Zhang</a>, <a href="/search/cs?searchtype=author&query=Ondina%2C+J+M+G">Jose Maria Gonzalez Ondina</a>, <a href="/search/cs?searchtype=author&query=Olabarrieta%2C+M">Maitane Olabarrieta</a>, <a href="/search/cs?searchtype=author&query=Xiao%2C+T">Tingsong Xiao</a>, <a href="/search/cs?searchtype=author&query=He%2C+W">Wenchong He</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Z">Zibo Liu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+S">Shigang Chen</a>, <a href="/search/cs?searchtype=author&query=Smith%2C+K">Kaleb Smith</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+Z">Zhe Jiang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.14952v1-abstract-short" style="display: inline;"> Nearly 900 million people live in low-lying coastal zones around the world and bear the brunt of impacts from more frequent and severe hurricanes and storm surges. Oceanographers simulate ocean current circulation along the coasts to develop early warning systems that save lives and prevent loss and damage to property from coastal hazards. Traditionally, such simulations are conducted using coasta… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.14952v1-abstract-full').style.display = 'inline'; document.getElementById('2410.14952v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.14952v1-abstract-full" style="display: none;"> Nearly 900 million people live in low-lying coastal zones around the world and bear the brunt of impacts from more frequent and severe hurricanes and storm surges.
Oceanographers simulate ocean current circulation along the coasts to develop early warning systems that save lives and prevent loss and damage to property from coastal hazards. Traditionally, such simulations are conducted using coastal ocean circulation models such as the Regional Ocean Modeling System (ROMS), which usually runs on an HPC cluster with multiple CPU cores. However, the process is time-consuming and energy expensive. While coarse-grained ROMS simulations offer faster alternatives, they sacrifice detail and accuracy, particularly in complex coastal environments. Recent advances in deep learning and GPU architecture have enabled the development of faster AI (neural network) surrogates. This paper introduces an AI surrogate based on a 4D Swin Transformer to simulate coastal tidal wave propagation in an estuary for both hindcast and forecast (up to 12 days). Our approach not only accelerates simulations but also incorporates a physics-based constraint to detect and correct inaccurate results, ensuring reliability while minimizing manual intervention. We develop a fully GPU-accelerated workflow, optimizing the model training and inference pipeline on NVIDIA DGX-2 A100 GPUs. Our experiments demonstrate that our AI surrogate reduces the time cost of 12-day forecasting of traditional ROMS simulations from 9,908 seconds (on 512 CPU cores) to 22 seconds (on one A100 GPU), achieving over 450$\times$ speedup while maintaining high-quality simulation results. This work contributes to oceanographic modeling by offering a fast, accurate, and physically consistent alternative to traditional simulation models, particularly for real-time forecasting in rapid disaster response. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.14952v1-abstract-full').style.display = 'none'; document.getElementById('2410.14952v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.14464">arXiv:2410.14464</a> <span> [<a href="https://arxiv.org/pdf/2410.14464">pdf</a>, <a href="https://arxiv.org/format/2410.14464">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Electrocardiogram-Language Model for Few-Shot Question Answering with Meta Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Tang%2C+J">Jialu Tang</a>, <a href="/search/cs?searchtype=author&query=Xia%2C+T">Tong Xia</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+Y">Yuan Lu</a>, <a href="/search/cs?searchtype=author&query=Mascolo%2C+C">Cecilia Mascolo</a>, <a href="/search/cs?searchtype=author&query=Saeed%2C+A">Aaqib Saeed</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.14464v1-abstract-short" style="display: inline;"> Electrocardiogram (ECG) interpretation requires specialized expertise, often involving synthesizing insights from ECG signals with complex clinical queries posed in natural language. 
The scarcity of labeled ECG data coupled with the diverse nature of clinical inquiries presents a significant challenge for developing robust and adaptable ECG diagnostic systems. This work introduces a novel multimod… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.14464v1-abstract-full').style.display = 'inline'; document.getElementById('2410.14464v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.14464v1-abstract-full" style="display: none;"> Electrocardiogram (ECG) interpretation requires specialized expertise, often involving synthesizing insights from ECG signals with complex clinical queries posed in natural language. The scarcity of labeled ECG data coupled with the diverse nature of clinical inquiries presents a significant challenge for developing robust and adaptable ECG diagnostic systems. This work introduces a novel multimodal meta-learning method for few-shot ECG question answering, addressing the challenge of limited labeled data while leveraging the rich knowledge encoded within large language models (LLMs). Our LLM-agnostic approach integrates a pre-trained ECG encoder with a frozen LLM (e.g., LLaMA and Gemma) via a trainable fusion module, enabling the language model to reason about ECG data and generate clinically meaningful answers. Extensive experiments demonstrate superior generalization to unseen diagnostic tasks compared to supervised baselines, achieving notable performance even with limited ECG leads. For instance, in a 5-way 5-shot setting, our method using LLaMA-3.1-8B achieves accuracies of 84.6%, 77.3%, and 69.6% on single verify, choose, and query question types, respectively. These results highlight the potential of our method to enhance clinical ECG interpretation by combining signal processing with the nuanced language understanding capabilities of LLMs, particularly in data-constrained scenarios. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.14464v1-abstract-full').style.display = 'none'; document.getElementById('2410.14464v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024.
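<p class="is-size-7">A rough PyTorch sketch of the trainable fusion module this abstract describes: pooled ECG-encoder features are projected into a frozen LLM's embedding space and prepended to the question's token embeddings. All dimensions, names, and the prefix design are assumptions for illustration, not the paper's architecture:</p> <pre><code class="language-python">
import torch
import torch.nn as nn

class ECGFusion(nn.Module):
    """Trainable bridge from ECG-encoder features to an LLM's embedding
    space. Illustrative wiring only; sizes and names are assumptions."""
    def __init__(self, ecg_dim=256, llm_dim=4096, n_prefix=8):
        super().__init__()
        self.proj = nn.Linear(ecg_dim, llm_dim * n_prefix)
        self.n_prefix, self.llm_dim = n_prefix, llm_dim

    def forward(self, ecg_feat, text_embeds):
        # ecg_feat: (B, ecg_dim) pooled output of a frozen ECG encoder
        # text_embeds: (B, T, llm_dim) token embeddings of the question
        prefix = self.proj(ecg_feat).view(-1, self.n_prefix, self.llm_dim)
        # The concatenated sequence is what the frozen LLM would consume.
        return torch.cat([prefix, text_embeds], dim=1)

fusion = ECGFusion()
print(fusion(torch.randn(2, 256), torch.randn(2, 16, 4096)).shape)
# torch.Size([2, 24, 4096])
</code></pre>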
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.10672">arXiv:2410.10672</a> <span> [<a href="https://arxiv.org/pdf/2410.10672">pdf</a>, <a href="https://arxiv.org/format/2410.10672">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Large Language Model Evaluation via Matrix Nuclear-Norm </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yahan Li</a>, <a href="/search/cs?searchtype=author&query=Xia%2C+T">Tingyu Xia</a>, <a href="/search/cs?searchtype=author&query=Chang%2C+Y">Yi Chang</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+Y">Yuan Wu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.10672v1-abstract-short" style="display: inline;"> As large language models (LLMs) continue to evolve, efficient evaluation metrics are vital for assessing their ability to compress information and reduce redundancy. While traditional metrics like Matrix Entropy offer valuable insights, they are computationally intensive for large-scale models due to their \( O(n^3) \) time complexity with Singular Value Decomposition (SVD). To mitigate this issue… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.10672v1-abstract-full').style.display = 'inline'; document.getElementById('2410.10672v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.10672v1-abstract-full" style="display: none;"> As large language models (LLMs) continue to evolve, efficient evaluation metrics are vital for assessing their ability to compress information and reduce redundancy. While traditional metrics like Matrix Entropy offer valuable insights, they are computationally intensive for large-scale models due to their \( O(n^3) \) time complexity with Singular Value Decomposition (SVD). To mitigate this issue, we introduce the Matrix Nuclear-Norm, which not only serves as a metric to quantify the data compression proficiency of LLMs but also provides a convex approximation of matrix rank to capture both predictive discriminability and diversity. By employing the \( L_{1,2}\text{-norm} \) to further approximate the nuclear norm, we can effectively assess the model's information compression capabilities. This approach reduces the time complexity to \( O(n^2) \) and eliminates the need for SVD computation. Consequently, the Matrix Nuclear-Norm achieves speeds 8 to 24 times faster than Matrix Entropy for the CEREBRAS-GPT model as sizes increase from 111M to 6.7B. This performance gap becomes more pronounced with larger models, as validated in tests with other models like Pythia. Additionally, evaluations on benchmarks and model responses confirm that our proposed Matrix Nuclear-Norm is a reliable, scalable, and efficient tool for assessing LLMs' performance, striking a balance between accuracy and computational efficiency. The code is available at https://github.com/MLGroupJLU/MatrixNuclearNorm.
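<p class="is-size-7">A minimal numpy sketch of the contrast this abstract draws between the exact nuclear norm (full SVD, \( O(n^3) \)) and an SVD-free \( L_{1,2} \)-norm surrogate (\( O(n^2) \)); the row/column convention and normalization below are assumptions for illustration, not the paper's exact definition:</p> <pre><code class="language-python">
import numpy as np

def nuclear_norm_exact(X):
    # Sum of singular values; needs a full SVD, O(n^3).
    return np.linalg.svd(X, compute_uv=False).sum()

def l12_surrogate(X):
    # SVD-free surrogate in the spirit of the L_{1,2} norm: sum of
    # column 2-norms, O(n^2). The paper's exact convention and
    # normalization may differ; this is an assumption.
    return np.linalg.norm(X, axis=0).sum()

rng = np.random.default_rng(0)
H = rng.standard_normal((512, 256))  # e.g., a matrix of hidden states
print(nuclear_norm_exact(H), l12_surrogate(H))
</code></pre>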
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.10672v1-abstract-full').style.display = 'none'; document.getElementById('2410.10672v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">22 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.10155">arXiv:2410.10155</a> <span> [<a href="https://arxiv.org/pdf/2410.10155">pdf</a>, <a href="https://arxiv.org/format/2410.10155">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Hardware Architecture">cs.AR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Tracing Human Stress from Physiological Signals using UWB Radar </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xu%2C+J">Jia Xu</a>, <a href="/search/cs?searchtype=author&query=Xiao%2C+T">Teng Xiao</a>, <a href="/search/cs?searchtype=author&query=Lv%2C+P">Pin Lv</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Z">Zhe Chen</a>, <a href="/search/cs?searchtype=author&query=Cai%2C+C">Chao Cai</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yang Zhang</a>, <a href="/search/cs?searchtype=author&query=Xiong%2C+Z">Zehui Xiong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.10155v1-abstract-short" style="display: inline;"> Stress tracing is an important research domain that supports many applications, such as health care and stress management; and its closest related works are derived from stress detection. However, these existing works cannot well address two important challenges facing stress detection. First, most of these studies involve asking users to wear physiological sensors to detect their stress states, w… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.10155v1-abstract-full').style.display = 'inline'; document.getElementById('2410.10155v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.10155v1-abstract-full" style="display: none;"> Stress tracing is an important research domain that supports many applications, such as health care and stress management; and its closest related works are derived from stress detection. However, these existing works cannot well address two important challenges facing stress detection. First, most of these studies involve asking users to wear physiological sensors to detect their stress states, which has a negative impact on the user experience. 
Second, these studies have failed to effectively utilize multimodal physiological signals, which results in less satisfactory detection results. This paper formally defines the stress tracing problem, which emphasizes the continuous detection of human stress states. A novel deep stress tracing method, named DST, is presented. Note that DST proposes tracing human stress based on physiological signals collected by a noncontact ultrawideband radar, which is more friendly to users when collecting their physiological signals. In DST, a signal extraction module is first carefully designed to robustly extract multimodal physiological signals from the raw RF data of the radar, even in the presence of body movement. Afterward, a multimodal fusion module is proposed in DST to ensure that the extracted multimodal physiological signals can be effectively fused and utilized. Extensive experiments are conducted on three real-world datasets, including one self-collected dataset and two public datasets. Experimental results show that the proposed DST method significantly outperforms all the baselines in terms of tracing human stress states. On average, DST provides a 6.31% increase in detection accuracy on all datasets compared with the best baselines. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.10155v1-abstract-full').style.display = 'none'; document.getElementById('2410.10155v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">19 pages, 11 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.10093">arXiv:2410.10093</a> <span> [<a href="https://arxiv.org/pdf/2410.10093">pdf</a>, <a href="https://arxiv.org/format/2410.10093">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> How to Leverage Demonstration Data in Alignment for Large Language Model? A Self-Imitation Learning Perspective </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xiao%2C+T">Teng Xiao</a>, <a href="/search/cs?searchtype=author&query=Li%2C+M">Mingxiao Li</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+Y">Yige Yuan</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+H">Huaisheng Zhu</a>, <a href="/search/cs?searchtype=author&query=Cui%2C+C">Chao Cui</a>, <a href="/search/cs?searchtype=author&query=Honavar%2C+V+G">Vasant G Honavar</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.10093v1-abstract-short" style="display: inline;"> This paper introduces a novel generalized self-imitation learning ($\textbf{GSIL}$) framework, which effectively and efficiently aligns large language models with offline demonstration data.
We develop $\textbf{GSIL}$ by deriving a surrogate objective of imitation learning with density ratio estimates, facilitating the use of self-generated data and optimizing the imitation learning objective with… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.10093v1-abstract-full').style.display = 'inline'; document.getElementById('2410.10093v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.10093v1-abstract-full" style="display: none;"> This paper introduces a novel generalized self-imitation learning ($\textbf{GSIL}$) framework, which effectively and efficiently aligns large language models with offline demonstration data. We develop $\textbf{GSIL}$ by deriving a surrogate objective of imitation learning with density ratio estimates, facilitating the use of self-generated data and optimizing the imitation learning objective with simple classification losses. $\textbf{GSIL}$ eliminates the need for complex adversarial training in standard imitation learning, achieving lightweight and efficient fine-tuning for large language models. In addition, $\textbf{GSIL}$ encompasses a family of offline losses parameterized by a general class of convex functions for density ratio estimation and enables a unified view for alignment with demonstration data. Extensive experiments show that $\textbf{GSIL}$ consistently and significantly outperforms baselines in many challenging benchmarks, such as coding (HumanEval), mathematical reasoning (GSM8K), and instruction following (MT-Bench). <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.10093v1-abstract-full').style.display = 'none'; document.getElementById('2410.10093v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024.
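<p class="is-size-7">A hedged sketch of the classification-loss idea in this abstract: demonstrations act as positives, self-generated responses as negatives, and the classifier logit is a scaled log-density ratio against a reference model. This is one logistic-loss member of such a family, not necessarily the exact $\textbf{GSIL}$ objective; all tensor values are toy assumptions:</p> <pre><code class="language-python">
import torch
import torch.nn.functional as F

def density_ratio_classification_loss(logp_theta_demo, logp_ref_demo,
                                      logp_theta_self, logp_ref_self,
                                      beta=0.1):
    # Logistic-loss sketch of imitation via density-ratio estimation:
    # the classifier logit is the scaled log-ratio log p_theta / p_ref;
    # demonstrations are labeled 1, self-generated samples 0.
    logit_demo = beta * (logp_theta_demo - logp_ref_demo)
    logit_self = beta * (logp_theta_self - logp_ref_self)
    return (F.softplus(-logit_demo).mean()    # -log sigmoid(logit)
            + F.softplus(logit_self).mean())  # -log(1 - sigmoid(logit))

# Toy sequence log-probabilities (hypothetical values):
lp_td, lp_rd = torch.tensor([-12.0, -9.5]), torch.tensor([-13.0, -10.0])
lp_ts, lp_rs = torch.tensor([-8.0, -7.5]), torch.tensor([-8.2, -7.4])
print(density_ratio_classification_loss(lp_td, lp_rd, lp_ts, lp_rs))
</code></pre>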
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">EMNLP 2024 Main</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.09398">arXiv:2410.09398</a> <span> [<a href="https://arxiv.org/pdf/2410.09398">pdf</a>, <a href="https://arxiv.org/format/2410.09398">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> MITA: Bridging the Gap between Model and Data for Test-time Adaptation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yuan%2C+Y">Yige Yuan</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+B">Bingbing Xu</a>, <a href="/search/cs?searchtype=author&query=Xiao%2C+T">Teng Xiao</a>, <a href="/search/cs?searchtype=author&query=Hou%2C+L">Liang Hou</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+F">Fei Sun</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+H">Huawei Shen</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+X">Xueqi Cheng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.09398v1-abstract-short" style="display: inline;"> Test-Time Adaptation (TTA) has emerged as a promising paradigm for enhancing the generalizability of models. However, existing mainstream TTA methods, predominantly operating at batch level, often exhibit suboptimal performance in complex real-world scenarios, particularly when confronting outliers or mixed distributions. This phenomenon stems from a pronounced over-reliance on statistical pattern… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.09398v1-abstract-full').style.display = 'inline'; document.getElementById('2410.09398v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.09398v1-abstract-full" style="display: none;"> Test-Time Adaptation (TTA) has emerged as a promising paradigm for enhancing the generalizability of models. However, existing mainstream TTA methods, predominantly operating at batch level, often exhibit suboptimal performance in complex real-world scenarios, particularly when confronting outliers or mixed distributions. This phenomenon stems from a pronounced over-reliance on statistical patterns over the distinct characteristics of individual instances, resulting in a divergence between the distribution captured by the model and data characteristics. To address this challenge, we propose Meet-In-The-Middle based Test-Time Adaptation ($\textbf{MITA}$), which introduces energy-based optimization to encourage mutual adaptation of the model and data from opposing directions, thereby meeting in the middle. MITA pioneers a significant departure from traditional approaches that focus solely on aligning the model to the data, facilitating a more effective bridging of the gap between model's distribution and data characteristics. 
Comprehensive experiments with MITA across three distinct scenarios (Outlier, Mixture, and Pure) demonstrate its superior performance over SOTA methods, highlighting its potential to significantly enhance generalizability in practical applications. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.09398v1-abstract-full').style.display = 'none'; document.getElementById('2410.09398v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.09335">arXiv:2410.09335</a> <span> [<a href="https://arxiv.org/pdf/2410.09335">pdf</a>, <a href="https://arxiv.org/format/2410.09335">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Rethinking Data Selection at Scale: Random Selection is Almost All You Need </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xia%2C+T">Tingyu Xia</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+B">Bowen Yu</a>, <a href="/search/cs?searchtype=author&query=Dang%2C+K">Kai Dang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+A">An Yang</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+Y">Yuan Wu</a>, <a href="/search/cs?searchtype=author&query=Tian%2C+Y">Yuan Tian</a>, <a href="/search/cs?searchtype=author&query=Chang%2C+Y">Yi Chang</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+J">Junyang Lin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.09335v1-abstract-short" style="display: inline;"> Supervised fine-tuning (SFT) is crucial for aligning Large Language Models (LLMs) with human instructions. The primary goal during SFT is to select a small yet representative subset of training data from the larger pool, such that fine-tuning with this subset achieves results comparable to or even exceeding those obtained using the entire dataset. However, most existing data selection techniques a… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.09335v1-abstract-full').style.display = 'inline'; document.getElementById('2410.09335v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.09335v1-abstract-full" style="display: none;"> Supervised fine-tuning (SFT) is crucial for aligning Large Language Models (LLMs) with human instructions. The primary goal during SFT is to select a small yet representative subset of training data from the larger pool, such that fine-tuning with this subset achieves results comparable to or even exceeding those obtained using the entire dataset. However, most existing data selection techniques are designed for small-scale data pools, which fail to meet the demands of real-world SFT scenarios. 
In this paper, we replicated several self-scoring methods, i.e., those that do not rely on external model assistance, on two-million-scale datasets, and found that nearly all methods struggled to significantly outperform random selection when dealing with such large-scale data pools. Moreover, our comparisons suggest that, during SFT, diversity in data selection is more critical than simply focusing on high-quality data. We also analyzed the limitations of several current approaches, explaining why they perform poorly on large-scale datasets and why they are unsuitable for such contexts. Finally, we found that filtering data by token length offers a stable and efficient method for improving results. This approach, particularly when training on long text data, proves highly beneficial for relatively weaker base models, such as Llama3. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.09335v1-abstract-full').style.display = 'none'; document.getElementById('2410.09335v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.08781">arXiv:2410.08781</a> <span> [<a href="https://arxiv.org/pdf/2410.08781">pdf</a>, <a href="https://arxiv.org/format/2410.08781">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> VideoSAM: Open-World Video Segmentation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Guo%2C+P">Pinxue Guo</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+Z">Zixu Zhao</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+J">Jianxiong Gao</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+C">Chongruo Wu</a>, <a href="/search/cs?searchtype=author&query=He%2C+T">Tong He</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Z">Zheng Zhang</a>, <a href="/search/cs?searchtype=author&query=Xiao%2C+T">Tianjun Xiao</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+W">Wenqiang Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.08781v1-abstract-short" style="display: inline;"> Video segmentation is essential for advancing robotics and autonomous driving, particularly in open-world settings where continuous perception and object association across video frames are critical. While the Segment Anything Model (SAM) has excelled in static image segmentation, extending its capabilities to video segmentation poses significant challenges.
We tackle two major hurdles: a) SAM's e… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.08781v1-abstract-full').style.display = 'inline'; document.getElementById('2410.08781v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.08781v1-abstract-full" style="display: none;"> Video segmentation is essential for advancing robotics and autonomous driving, particularly in open-world settings where continuous perception and object association across video frames are critical. While the Segment Anything Model (SAM) has excelled in static image segmentation, extending its capabilities to video segmentation poses significant challenges. We tackle two major hurdles: a) SAM's embedding limitations in associating objects across frames, and b) granularity inconsistencies in object segmentation. To this end, we introduce VideoSAM, an end-to-end framework designed to address these challenges by improving object tracking and segmentation consistency in dynamic environments. VideoSAM integrates an agglomerated backbone, RADIO, enabling object association through similarity metrics, and introduces Cycle-ack-Pairs Propagation with a memory mechanism for stable object tracking. Additionally, we incorporate an autoregressive object-token mechanism within the SAM decoder to maintain consistent granularity across frames. Our method is extensively evaluated on the UVO and BURST benchmarks, and robotic videos from RoboTAP, demonstrating its effectiveness and robustness in real-world scenarios. All code will be made available. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.08781v1-abstract-full').style.display = 'none'; document.getElementById('2410.08781v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024.
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.08316">arXiv:2410.08316</a> <span> [<a href="https://arxiv.org/pdf/2410.08316">pdf</a>, <a href="https://arxiv.org/format/2410.08316">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Optimization and Control">math.OC</span> </div> </div> <p class="title is-5 mathjax"> HyperDPO: Hypernetwork-based Multi-Objective Fine-Tuning Framework </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ren%2C+Y">Yinuo Ren</a>, <a href="/search/cs?searchtype=author&query=Xiao%2C+T">Tesi Xiao</a>, <a href="/search/cs?searchtype=author&query=Shavlovsky%2C+M">Michael Shavlovsky</a>, <a href="/search/cs?searchtype=author&query=Ying%2C+L">Lexing Ying</a>, <a href="/search/cs?searchtype=author&query=Rahmanian%2C+H">Holakou Rahmanian</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.08316v1-abstract-short" style="display: inline;"> In LLM alignment and many other ML applications, one often faces the Multi-Objective Fine-Tuning (MOFT) problem, i.e. fine-tuning an existing model with datasets labeled w.r.t. different objectives simultaneously. To address the challenge, we propose the HyperDPO framework, a hypernetwork-based approach that extends the Direct Preference Optimization (DPO) technique, originally developed for effic… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.08316v1-abstract-full').style.display = 'inline'; document.getElementById('2410.08316v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.08316v1-abstract-full" style="display: none;"> In LLM alignment and many other ML applications, one often faces the Multi-Objective Fine-Tuning (MOFT) problem, i.e. fine-tuning an existing model with datasets labeled w.r.t. different objectives simultaneously. To address the challenge, we propose the HyperDPO framework, a hypernetwork-based approach that extends the Direct Preference Optimization (DPO) technique, originally developed for efficient LLM alignment with preference data, to accommodate the MOFT settings. By substituting the Bradley-Terry-Luce model in DPO with the Plackett-Luce model, our framework is capable of handling a wide range of MOFT tasks that involve listwise ranking datasets. Compared with previous approaches, HyperDPO enjoys an efficient one-shot training process for profiling the Pareto front of auxiliary objectives, and offers flexible post-training control over trade-offs. Additionally, we propose a novel Hyper Prompt Tuning design, that conveys continuous weight across objectives to transformer-based models without altering their architecture. We demonstrate the effectiveness and efficiency of the HyperDPO framework through its applications to various tasks, including Learning-to-Rank (LTR) and LLM alignment, highlighting its viability for large-scale ML deployments. 
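<p class="is-size-7">The key substitution in this abstract, Plackett-Luce in place of Bradley-Terry-Luce, can be sketched as a listwise negative log-likelihood over DPO-style implicit rewards; the hypernetwork and Hyper Prompt Tuning components are omitted, and all names and values below are illustrative assumptions rather than the paper's implementation:</p> <pre><code class="language-python">
import torch

def plackett_luce_nll(scores):
    # scores: (batch, K) implicit rewards for K responses per prompt,
    # ordered best-to-worst along dim 1 (the observed ranking).
    # log P(ranking) = sum_k [ s_k - logsumexp(s_k, ..., s_K) ]
    rev = torch.flip(scores, dims=[1])
    suffix_lse = torch.flip(torch.logcumsumexp(rev, dim=1), dims=[1])
    return -(scores - suffix_lse).sum(dim=1).mean()

# DPO-style implicit rewards: beta * (log pi_theta - log pi_ref).
# With K=2 this reduces to the usual pairwise (Bradley-Terry) DPO loss.
beta = 0.1
logp_theta = torch.tensor([[-10.0, -11.5, -13.0]])  # ranked best-to-worst
logp_ref = torch.tensor([[-11.0, -11.6, -12.5]])
print(plackett_luce_nll(beta * (logp_theta - logp_ref)))
</code></pre>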
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.08316v1-abstract-full').style.display = 'none'; document.getElementById('2410.08316v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.05361">arXiv:2410.05361</a> <span> [<a href="https://arxiv.org/pdf/2410.05361">pdf</a>, <a href="https://arxiv.org/format/2410.05361">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> RespLLM: Unifying Audio and Text with Multimodal LLMs for Generalized Respiratory Health Prediction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yuwei Zhang</a>, <a href="/search/cs?searchtype=author&query=Xia%2C+T">Tong Xia</a>, <a href="/search/cs?searchtype=author&query=Saeed%2C+A">Aaqib Saeed</a>, <a href="/search/cs?searchtype=author&query=Mascolo%2C+C">Cecilia Mascolo</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.05361v1-abstract-short" style="display: inline;"> The high incidence and mortality rates associated with respiratory diseases underscores the importance of early screening. Machine learning models can automate clinical consultations and auscultation, offering vital support in this area. However, the data involved, spanning demographics, medical history, symptoms, and respiratory audio, are heterogeneous and complex. Existing approaches are insuff… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.05361v1-abstract-full').style.display = 'inline'; document.getElementById('2410.05361v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.05361v1-abstract-full" style="display: none;"> The high incidence and mortality rates associated with respiratory diseases underscores the importance of early screening. Machine learning models can automate clinical consultations and auscultation, offering vital support in this area. However, the data involved, spanning demographics, medical history, symptoms, and respiratory audio, are heterogeneous and complex. Existing approaches are insufficient and lack generalizability, as they typically rely on limited training data, basic fusion techniques, and task-specific models. In this paper, we propose RespLLM, a novel multimodal large language model (LLM) framework that unifies text and audio representations for respiratory health prediction. RespLLM leverages the extensive prior knowledge of pretrained LLMs and enables effective audio-text fusion through cross-modal attentions. 
Instruction tuning is employed to integrate diverse data from multiple sources, ensuring generalizability and versatility of the model. Experiments on five real-world datasets demonstrate that RespLLM outperforms leading baselines by an average of 4.6% on trained tasks, 7.9% on unseen datasets, and facilitates zero-shot predictions for new tasks. Our work lays the foundation for multimodal models that can perceive, listen to, and understand heterogeneous data, paving the way for scalable respiratory health diagnosis. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.05361v1-abstract-full').style.display = 'none'; document.getElementById('2410.05361v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.04734">arXiv:2410.04734</a> <span> [<a href="https://arxiv.org/pdf/2410.04734">pdf</a>, <a href="https://arxiv.org/format/2410.04734">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> TLDR: Token-Level Detective Reward Model for Large Vision Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Fu%2C+D">Deqing Fu</a>, <a href="/search/cs?searchtype=author&query=Xiao%2C+T">Tong Xiao</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+R">Rui Wang</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+W">Wang Zhu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+P">Pengchuan Zhang</a>, <a href="/search/cs?searchtype=author&query=Pang%2C+G">Guan Pang</a>, <a href="/search/cs?searchtype=author&query=Jia%2C+R">Robin Jia</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+L">Lawrence Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.04734v1-abstract-short" style="display: inline;"> Although reward models have been successful in improving multimodal large language models, the reward models themselves remain brutal and contain minimal information. Notably, existing reward models only mimic human annotations by assigning only one binary feedback to any text, no matter how long the text is. In the realm of multimodal language models, where models are required to process both ima… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.04734v1-abstract-full').style.display = 'inline'; document.getElementById('2410.04734v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.04734v1-abstract-full" style="display: none;"> Although reward models have been successful in improving multimodal large language models, the reward models themselves remain brutal and contain minimal information. 
Notably, existing reward models only mimic human annotations by assigning only one binary feedback to any text, no matter how long the text is. In the realm of multimodal language models, where models are required to process both images and texts, a naive reward model may learn implicit biases toward texts and become less grounded in images. In this paper, we propose a $\textbf{T}$oken-$\textbf{L}$evel $\textbf{D}$etective $\textbf{R}$eward Model ($\textbf{TLDR}$) to provide fine-grained annotations to each text token. We first introduce a perturbation-based method to generate synthetic hard negatives and their token-level labels to train TLDR models. Then we show the broad usefulness of TLDR models both in assisting off-the-shelf models to self-correct their generations, and in serving as a hallucination evaluation tool. Finally, we show that TLDR models can significantly speed up human annotation by 3 times to acquire a broader range of high-quality vision language data. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.04734v1-abstract-full').style.display = 'none'; document.getElementById('2410.04734v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Work done at Meta</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.04727">arXiv:2410.04727</a> <span> [<a href="https://arxiv.org/pdf/2410.04727">pdf</a>, <a href="https://arxiv.org/format/2410.04727">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Forgetting Curve: A Reliable Method for Evaluating Memorization Capability for Long-context Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+X">Xinyu Liu</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+R">Runsong Zhao</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+P">Pengcheng Huang</a>, <a href="/search/cs?searchtype=author&query=Xiao%2C+C">Chunyang Xiao</a>, <a href="/search/cs?searchtype=author&query=Li%2C+B">Bei Li</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+J">Jingang Wang</a>, <a href="/search/cs?searchtype=author&query=Xiao%2C+T">Tong Xiao</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+J">Jingbo Zhu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.04727v1-abstract-short" style="display: inline;"> Numerous recent works aim to extend the effective context length of language models, and various methods, tasks, and benchmarks exist to measure a model's effective memorization length. However, through thorough investigation, we find limitations in currently existing evaluations of a model's memorization capability.
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.04727">arXiv:2410.04727</a> <span> [<a href="https://arxiv.org/pdf/2410.04727">pdf</a>, <a href="https://arxiv.org/format/2410.04727">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Forgetting Curve: A Reliable Method for Evaluating Memorization Capability for Long-context Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+X">Xinyu Liu</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+R">Runsong Zhao</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+P">Pengcheng Huang</a>, <a href="/search/cs?searchtype=author&query=Xiao%2C+C">Chunyang Xiao</a>, <a href="/search/cs?searchtype=author&query=Li%2C+B">Bei Li</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+J">Jingang Wang</a>, <a href="/search/cs?searchtype=author&query=Xiao%2C+T">Tong Xiao</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+J">Jingbo Zhu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2410.04727v1-abstract-full"> Numerous recent works aim to extend the effective context length of language models, and various methods, tasks and benchmarks exist to measure a model's effective memorization length. However, through thorough investigations, we find limitations in currently existing evaluations of a model's memorization capability. We provide an extensive survey of these limitations in this work and propose a new method called the forgetting curve to measure the memorization capability of long-context models. We show that the forgetting curve has the advantage of being robust to the tested corpus and the experimental settings, of not relying on prompts, and of being applicable to any model size. We apply our forgetting curve to a large variety of models involving both transformer and RNN/SSM based architectures. Our measurement provides empirical evidence for the effectiveness of transformer extension techniques while raising questions about the effective length of RNN/SSM based models. We also examine the difference between our measurement and existing benchmarks as well as popular metrics for various models. Our code and results can be found at https://github.com/1azybug/ForgettingCurve. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li>
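<p>To make the measurement concrete: a forgetting-curve style probe asks how reliably a model reproduces content seen d tokens earlier, for growing d, and reads the effective memory off the decay point. The probe design and the toy stand-in model below are illustrative assumptions, not the paper's protocol (see their repository for the real code).</p> <pre><code>
# Minimal forgetting-curve style probe (our simplification of arXiv:2410.04727).
import random

def forgetting_curve(model_copy_fn, distances, trials=50, probe_len=8, vocab=1000):
    """model_copy_fn(context, k) should return its reconstruction of the
    first k tokens of `context`, which lie d tokens in the past."""
    curve = {}
    for d in distances:
        hits = 0
        for _ in range(trials):
            probe = [random.randrange(vocab) for _ in range(probe_len)]
            filler = [random.randrange(vocab) for _ in range(d)]
            hits += model_copy_fn(probe + filler, probe_len) == probe
        curve[d] = hits / trials
    return curve  # accuracy should decay past the model's effective memory

# Toy stand-in "model" that remembers only the last 512 tokens:
def toy_model(context, k):
    return context[:k] if len(context) - k <= 512 else [None] * k

print(forgetting_curve(toy_model, distances=[128, 512, 2048]))
# {128: 1.0, 512: 1.0, 2048: 0.0}
</code></pre>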
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.04503">arXiv:2410.04503</a> <span> [<a href="https://arxiv.org/pdf/2410.04503">pdf</a>, <a href="https://arxiv.org/format/2410.04503">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> LRHP: Learning Representations for Human Preferences via Preference Pairs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+C">Chenglong Wang</a>, <a href="/search/cs?searchtype=author&query=Gan%2C+Y">Yang Gan</a>, <a href="/search/cs?searchtype=author&query=Huo%2C+Y">Yifu Huo</a>, <a href="/search/cs?searchtype=author&query=Mu%2C+Y">Yongyu Mu</a>, <a href="/search/cs?searchtype=author&query=He%2C+Q">Qiaozhi He</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+M">Murun Yang</a>, <a href="/search/cs?searchtype=author&query=Xiao%2C+T">Tong Xiao</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+C">Chunliang Zhang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+T">Tongran Liu</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+J">Jingbo Zhu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2410.04503v1-abstract-full"> To improve human-preference alignment training, current research has developed numerous preference datasets consisting of preference pairs labeled as "preferred" or "dispreferred". These preference pairs are typically used to encode human preferences into a single numerical value through reward modeling, which acts as a reward signal during reinforcement learning from human feedback (RLHF). However, representing these human preferences as a numerical value complicates the analysis of these preferences and restricts their broader application beyond RLHF. In contrast, in this work, we introduce a preference representation learning task that aims to construct a richer and more structured representation of human preferences. We further develop a more generalizable framework, Learning Representations for Human Preferences via preference pairs (namely LRHP), which extends beyond traditional reward modeling to tackle this task. We verify the utility of preference representations in two downstream tasks: preference data selection and preference margin prediction.
Building upon the human preferences in representations, we achieve strong performance in both tasks, significantly outperforming baselines. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.01739">arXiv:2410.01739</a> <span> [<a href="https://arxiv.org/pdf/2410.01739">pdf</a>, <a href="https://arxiv.org/format/2410.01739">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Mimicking Human Intuition: Cognitive Belief-Driven Q-Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Gu%2C+X">Xingrui Gu</a>, <a href="/search/cs?searchtype=author&query=Qiao%2C+G">Guanren Qiao</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+C">Chuyi Jiang</a>, <a href="/search/cs?searchtype=author&query=Xia%2C+T">Tianqing Xia</a>, <a href="/search/cs?searchtype=author&query=Mao%2C+H">Hangyu Mao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2410.01739v2-abstract-full"> Reinforcement learning encounters challenges in various environments related to robustness and explainability. Traditional Q-learning algorithms cannot effectively make decisions and utilize the historical learning experience. To overcome these limitations, we propose Cognitive Belief-Driven Q-Learning (CBDQ), which integrates subjective belief modeling into the Q-learning framework, enhancing decision-making accuracy by endowing agents with human-like learning and reasoning capabilities. Drawing inspiration from cognitive science, our method maintains a subjective belief distribution over the expectation of actions, leveraging a cluster-based subjective belief model that enables agents to reason about the potential probability associated with each decision. CBDQ effectively mitigates overestimation phenomena and optimizes decision-making policies by integrating historical experiences with current contextual information, mimicking the dynamics of human decision-making. We evaluate the proposed method on discrete control benchmark tasks in various complicated environments. The results demonstrate that CBDQ exhibits stronger adaptability, robustness, and human-like characteristics in handling these environments, outperforming other baselines. We hope this work will give researchers a fresh perspective on understanding and explaining Q-learning. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 2 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li>
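<p>One way to read the belief mechanism in the CBDQ abstract is as a soft, belief-weighted backup in place of Q-learning's hard max, which is also where the mitigation of overestimation comes from. The sketch below renders that idea minimally; the softmax belief tracker is our simplification of the paper's cluster-based subjective belief model.</p> <pre><code>
# Belief-weighted Q-learning update in the spirit of CBDQ (arXiv:2410.01739).
import numpy as np

def softmax(x, tau=1.0):
    z = np.exp((x - x.max()) / tau)
    return z / z.sum()

n_states, n_actions, alpha, gamma = 4, 3, 0.1, 0.95
Q = np.zeros((n_states, n_actions))
belief = np.full((n_states, n_actions), 1.0 / n_actions)  # subjective belief

def update(s, a, r, s_next):
    # The target uses a belief-weighted expectation instead of a hard max,
    # damping the overestimation that max-based targets suffer from.
    target = r + gamma * np.dot(belief[s_next], Q[s_next])
    Q[s, a] += alpha * (target - Q[s, a])
    belief[s] = 0.9 * belief[s] + 0.1 * softmax(Q[s])  # slowly track experience

update(0, 1, 1.0, 2)
</code></pre>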
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.19071">arXiv:2409.19071</a> <span> [<a href="https://arxiv.org/pdf/2409.19071">pdf</a>, <a href="https://arxiv.org/format/2409.19071">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Emerging Technologies">cs.ET</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Analog fast Fourier transforms for scalable and efficient signal processing </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xiao%2C+T+P">T. Patrick Xiao</a>, <a href="/search/cs?searchtype=author&query=Feinberg%2C+B">Ben Feinberg</a>, <a href="/search/cs?searchtype=author&query=Richardson%2C+D+K">David K. Richardson</a>, <a href="/search/cs?searchtype=author&query=Cannon%2C+M">Matthew Cannon</a>, <a href="/search/cs?searchtype=author&query=Medu%2C+H">Harsha Medu</a>, <a href="/search/cs?searchtype=author&query=Agrawal%2C+V">Vineet Agrawal</a>, <a href="/search/cs?searchtype=author&query=Marinella%2C+M+J">Matthew J. Marinella</a>, <a href="/search/cs?searchtype=author&query=Agarwal%2C+S">Sapan Agarwal</a>, <a href="/search/cs?searchtype=author&query=Bennett%2C+C+H">Christopher H. Bennett</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2409.19071v1-abstract-full"> Edge devices are being deployed at increasing volumes to sense and act on information from the physical world. The discrete Fourier transform (DFT) is often necessary to make this sensed data suitable for further processing $\unicode{x2013}$ such as by artificial intelligence (AI) algorithms $\unicode{x2013}$ and for transmission over communication networks.
Analog in-memory computing has been shown to be a fast and energy-efficient solution for processing edge AI workloads, but not for Fourier transforms. This is because of the existence of the fast Fourier transform (FFT) algorithm, which enormously reduces the complexity of the DFT but has so far belonged only to digital processors. Here, we show that the FFT can be mapped to analog in-memory computing systems, enabling them to efficiently scale to arbitrarily large Fourier transforms without requiring large sizes or large numbers of non-volatile memory arrays. We experimentally demonstrate analog FFTs on 1D audio and 2D image signals, using a large-scale charge-trapping memory array with precisely tunable, low-conductance analog states. The scalability of both the new analog FFT approach and the charge-trapping memory device is leveraged to compute a 65,536-point analog DFT, a scale that is otherwise inaccessible by analog systems and which is $>$1000$\times$ larger than any previous analog DFT demonstration. The analog FFT also provides more numerically precise DFTs with greater tolerance to device and circuit non-idealities than a direct matrix-vector multiplication approach. We show that the extension of the FFT algorithm to analog in-memory processors leads to design considerations that differ markedly from digital implementations, and that analog Fourier transforms have a substantial power efficiency advantage at all size scales over FFTs implemented on state-of-the-art digital hardware. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li>
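<p>The mathematical core of the claim is that the FFT decomposes one large DFT into many small products, the form that analog matrix-vector hardware can execute. The radix-2 recursion below shows that butterfly structure; the hardware mapping, device model, and scaling analysis in the paper are not represented here.</p> <pre><code>
# Radix-2 decimation-in-time FFT: each stage is a layer of small 2x2
# "butterfly" products, which is what maps onto analog in-memory
# matrix-vector multiplication (arXiv:2409.19071). Math only, no hardware.
import numpy as np

def fft_radix2(x):
    n = len(x)  # n must be a power of two
    if n == 1:
        return x
    even, odd = fft_radix2(x[0::2]), fft_radix2(x[1::2])
    twiddle = np.exp(-2j * np.pi * np.arange(n // 2) / n)
    return np.concatenate([even + twiddle * odd, even - twiddle * odd])

x = np.random.randn(16).astype(complex)
assert np.allclose(fft_radix2(x), np.fft.fft(x))
</code></pre>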
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.16283">arXiv:2409.16283</a> <span> [<a href="https://arxiv.org/pdf/2409.16283">pdf</a>, <a href="https://arxiv.org/format/2409.16283">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> Gen2Act: Human Video Generation in Novel Scenarios enables Generalizable Robot Manipulation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Bharadhwaj%2C+H">Homanga Bharadhwaj</a>, <a href="/search/cs?searchtype=author&query=Dwibedi%2C+D">Debidatta Dwibedi</a>, <a href="/search/cs?searchtype=author&query=Gupta%2C+A">Abhinav Gupta</a>, <a href="/search/cs?searchtype=author&query=Tulsiani%2C+S">Shubham Tulsiani</a>, <a href="/search/cs?searchtype=author&query=Doersch%2C+C">Carl Doersch</a>, <a href="/search/cs?searchtype=author&query=Xiao%2C+T">Ted Xiao</a>, <a href="/search/cs?searchtype=author&query=Shah%2C+D">Dhruv Shah</a>, <a href="/search/cs?searchtype=author&query=Xia%2C+F">Fei Xia</a>, <a href="/search/cs?searchtype=author&query=Sadigh%2C+D">Dorsa Sadigh</a>, <a href="/search/cs?searchtype=author&query=Kirmani%2C+S">Sean Kirmani</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2409.16283v1-abstract-full"> How can robot manipulation policies generalize to novel tasks involving unseen object types and new motions? In this paper, we provide a solution in terms of predicting motion information from web data through human video generation and conditioning a robot policy on the generated video. Instead of attempting to scale robot data collection, which is expensive, we show how we can leverage video generation models trained on easily available web data, for enabling generalization. Our approach Gen2Act casts language-conditioned manipulation as zero-shot human video generation followed by execution with a single policy conditioned on the generated video. To train the policy, we use an order of magnitude less robot interaction data compared to what the video prediction model was trained on.
Gen2Act doesn't require fine-tuning the video model at all and we directly use a pre-trained model for generating human videos. Our results on diverse real-world scenarios show how Gen2Act enables manipulating unseen object types and performing novel motions for tasks not present in the robot data. Videos are at https://homangab.github.io/gen2act/ </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Preprint. Under Review</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.15911">arXiv:2409.15911</a> <span> [<a href="https://arxiv.org/pdf/2409.15911">pdf</a>, <a href="https://arxiv.org/format/2409.15911">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> A Modular-based Strategy for Mitigating Gradient Conflicts in Simultaneous Speech Translation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+X">Xiaoqian Liu</a>, <a href="/search/cs?searchtype=author&query=Du%2C+Y">Yangfan Du</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+J">Jianjin Wang</a>, <a href="/search/cs?searchtype=author&query=Ge%2C+Y">Yuan Ge</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+C">Chen Xu</a>, <a href="/search/cs?searchtype=author&query=Xiao%2C+T">Tong Xiao</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+G">Guocheng Chen</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+J">Jingbo Zhu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2409.15911v2-abstract-full"> Simultaneous Speech Translation (SimulST) involves generating target language text while continuously processing streaming speech input, presenting significant real-time challenges. Multi-task learning is often employed to enhance SimulST performance but introduces optimization conflicts between primary and auxiliary tasks, potentially compromising overall efficiency.
The existing model-level conflict resolution methods are not well-suited for this task, which exacerbates inefficiencies and leads to high GPU memory consumption. To address these challenges, we propose a Modular Gradient Conflict Mitigation (MGCM) strategy that detects conflicts at a finer-grained modular level and resolves them utilizing gradient projection. Experimental results demonstrate that MGCM significantly improves SimulST performance, particularly under medium and high latency conditions, achieving a 0.68 BLEU score gain in offline tasks. Additionally, MGCM reduces GPU memory consumption by over 95\% compared to other conflict mitigation methods, establishing it as a robust solution for SimulST tasks. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 24 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Submitted to ICASSP 2025</span> </p> </li>
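<p>Gradient projection of the kind MGCM applies can be sketched in a few lines: when an auxiliary task's gradient opposes the primary task's (negative dot product), drop its conflicting component. Applying this per named module, rather than over the whole model, is the paper's distinguishing choice; the function names below are ours.</p> <pre><code>
# Modular gradient-conflict mitigation sketch (cf. MGCM, arXiv:2409.15911).
import numpy as np

def project_if_conflicting(g_primary, g_aux):
    dot = np.dot(g_aux, g_primary)
    if dot < 0:  # conflict: remove the component opposing the primary task
        g_aux = g_aux - dot / np.dot(g_primary, g_primary) * g_primary
    return g_aux

def merge_gradients(primary, auxiliary):
    # Gradients are kept per module, e.g. {"encoder": vec, "decoder": vec},
    # so conflicts are detected and resolved at the modular level.
    return {name: primary[name] + project_if_conflicting(primary[name], auxiliary[name])
            for name in primary}

g_p = {"encoder": np.array([1.0, 0.0]), "decoder": np.array([0.0, 1.0])}
g_a = {"encoder": np.array([-1.0, 1.0]), "decoder": np.array([0.0, 2.0])}
print(merge_gradients(g_p, g_a))  # encoder conflict projected away
</code></pre>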
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Submitted to ICASSP 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.14364">arXiv:2409.14364</a> <span> [<a href="https://arxiv.org/pdf/2409.14364">pdf</a>, <a href="https://arxiv.org/format/2409.14364">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> More Effective LLM Compressed Tokens with Uniformly Spread Position Identifiers and Compression Loss </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhao%2C+R">Runsong Zhao</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+P">Pengcheng Huang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+X">Xinyu Liu</a>, <a href="/search/cs?searchtype=author&query=Xiao%2C+C">Chunyang Xiao</a>, <a href="/search/cs?searchtype=author&query=Xiao%2C+T">Tong Xiao</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+J">Jingbo Zhu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.14364v2-abstract-short" style="display: inline;"> Compressing Transformer inputs into compressd tokens allows running LLMs with improved speed and cost efficiency. Based on the compression method ICAE, we carefully examine the position identifier choices for compressed tokens and also propose a new compression loss. We demonstrate empirically that our proposed methods achieve significantly higher compression ratios (15x compared to 4x for ICAE),… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.14364v2-abstract-full').style.display = 'inline'; document.getElementById('2409.14364v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.14364v2-abstract-full" style="display: none;"> Compressing Transformer inputs into compressd tokens allows running LLMs with improved speed and cost efficiency. Based on the compression method ICAE, we carefully examine the position identifier choices for compressed tokens and also propose a new compression loss. We demonstrate empirically that our proposed methods achieve significantly higher compression ratios (15x compared to 4x for ICAE), while being able to attain comparable reconstruction performance. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.14364v2-abstract-full').style.display = 'none'; document.getElementById('2409.14364v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 22 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. 
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.11308">arXiv:2409.11308</a> <span> [<a href="https://arxiv.org/pdf/2409.11308">pdf</a>, <a href="https://arxiv.org/format/2409.11308">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> SpMis: An Investigation of Synthetic Spoken Misinformation Detection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+P">Peizhuo Liu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+L">Li Wang</a>, <a href="/search/cs?searchtype=author&query=He%2C+R">Renqiang He</a>, <a href="/search/cs?searchtype=author&query=He%2C+H">Haorui He</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+L">Lei Wang</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+H">Huadi Zheng</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+J">Jie Shi</a>, <a href="/search/cs?searchtype=author&query=Xiao%2C+T">Tong Xiao</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+Z">Zhizheng Wu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2409.11308v1-abstract-full"> In recent years, speech generation technology has advanced rapidly, fueled by generative models and large-scale training techniques. While these developments have enabled the production of high-quality synthetic speech, they have also raised concerns about the misuse of this technology, particularly for generating synthetic misinformation. Current research primarily focuses on distinguishing machine-generated speech from human-produced speech, but the more urgent challenge is detecting misinformation within spoken content. This task requires a thorough analysis of factors such as speaker identity, topic, and synthesis. To address this need, we conduct an initial investigation into synthetic spoken misinformation detection by introducing an open-source dataset, SpMis. SpMis includes speech synthesized from over 1,000 speakers across five common topics, utilizing state-of-the-art text-to-speech systems. Although our results show promising detection capabilities, they also reveal substantial challenges for practical implementation, underscoring the importance of ongoing research in this critical area.
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.11308v1-abstract-full').style.display = 'none'; document.getElementById('2409.11308v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted in SLT 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.10764">arXiv:2409.10764</a> <span> [<a href="https://arxiv.org/pdf/2409.10764">pdf</a>, <a href="https://arxiv.org/format/2409.10764">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> </div> </div> <p class="title is-5 mathjax"> Federated Learning for Smart Grid: A Survey on Applications and Potential Vulnerabilities </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+Z">Zikai Zhang</a>, <a href="/search/cs?searchtype=author&query=Rath%2C+S">Suman Rath</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+J">Jiaohao Xu</a>, <a href="/search/cs?searchtype=author&query=Xiao%2C+T">Tingsong Xiao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.10764v1-abstract-short" style="display: inline;"> The Smart Grid (SG) is a critical energy infrastructure that collects real-time electricity usage data to forecast future energy demands using information and communication technologies (ICT). Due to growing concerns about data security and privacy in SGs, federated learning (FL) has emerged as a promising training framework. FL offers a balance between privacy, efficiency, and accuracy in SGs by… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.10764v1-abstract-full').style.display = 'inline'; document.getElementById('2409.10764v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.10764v1-abstract-full" style="display: none;"> The Smart Grid (SG) is a critical energy infrastructure that collects real-time electricity usage data to forecast future energy demands using information and communication technologies (ICT). Due to growing concerns about data security and privacy in SGs, federated learning (FL) has emerged as a promising training framework. FL offers a balance between privacy, efficiency, and accuracy in SGs by enabling collaborative model training without sharing private data from IoT devices. In this survey, we thoroughly review recent advancements in designing FL-based SG systems across three stages: generation, transmission and distribution, and consumption. Additionally, we explore potential vulnerabilities that may arise when implementing FL in these stages. Finally, we discuss the gap between state-of-the-art FL research and its practical applications in SGs and propose future research directions. 
These focus on potential attack and defense strategies for FL-based SG systems and the need to build a robust FL-based SG infrastructure. Unlike traditional surveys that address security issues in centralized machine learning methods for SG systems, this survey specifically examines the applications and security concerns in FL-based SG systems for the first time. Our aim is to inspire further research into applications and improvements in the robustness of FL-based SG systems. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">MSC Class:</span> C.2.4 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.10365">arXiv:2409.10365</a> <span> [<a href="https://arxiv.org/pdf/2409.10365">pdf</a>, <a href="https://arxiv.org/format/2409.10365">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Robust image representations with counterfactual contrastive learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Roschewitz%2C+M">Mélanie Roschewitz</a>, <a href="/search/cs?searchtype=author&query=Ribeiro%2C+F+D+S">Fabio De Sousa Ribeiro</a>, <a href="/search/cs?searchtype=author&query=Xia%2C+T">Tian Xia</a>, <a href="/search/cs?searchtype=author&query=Khara%2C+G">Galvin Khara</a>, <a href="/search/cs?searchtype=author&query=Glocker%2C+B">Ben Glocker</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2409.10365v1-abstract-full"> Contrastive pretraining can substantially increase model generalisation and downstream performance. However, the quality of the learned representations is highly dependent on the data augmentation strategy applied to generate positive pairs. Positive contrastive pairs should preserve semantic meaning while discarding unwanted variations related to the data acquisition domain.
Traditional contrastive pipelines attempt to simulate domain shifts through pre-defined generic image transformations. However, these do not always mimic realistic and relevant domain variations for medical imaging, such as scanner differences. To tackle this issue, we herein introduce counterfactual contrastive learning, a novel framework leveraging recent advances in causal image synthesis to create contrastive positive pairs that faithfully capture relevant domain variations. Our method, evaluated across five datasets encompassing both chest radiography and mammography data, for two established contrastive objectives (SimCLR and DINO-v2), outperforms standard contrastive learning in terms of robustness to acquisition shift. Notably, counterfactual contrastive learning achieves superior downstream performance on both in-distribution and external datasets, especially for images acquired with scanners under-represented in the training set. Further experiments show that the proposed framework extends beyond acquisition shifts, with models trained with counterfactual contrastive learning substantially improving subgroup performance across biological sex. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Code available at https://github.com/biomedia-mira/counterfactual-contrastive/</span> </p> </li>
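<p>Structurally, the framework keeps a standard contrastive objective and swaps generic augmentations for counterfactual positives. The sketch below uses a plain InfoNCE loss and marks the counterfactual generator as a placeholder; the causal synthesis model and the SimCLR/DINO-v2 training specifics are omitted.</p> <pre><code>
# Counterfactual positives inside a standard contrastive loss
# (cf. arXiv:2409.10365). `counterfactual` is a placeholder generator.
import torch
import torch.nn.functional as F

def info_nce(z1, z2, tau=0.1):
    z1, z2 = F.normalize(z1, dim=1), F.normalize(z2, dim=1)
    logits = z1 @ z2.t() / tau         # (batch, batch) similarity matrix
    labels = torch.arange(z1.size(0))  # positives sit on the diagonal
    return F.cross_entropy(logits, labels)

def counterfactual(images, new_scanner):
    # Stand-in for a causal image-synthesis model that changes only the
    # acquisition domain (e.g. scanner) while preserving semantics.
    return images + 0.01 * torch.randn_like(images)

encoder = torch.nn.Sequential(torch.nn.Flatten(), torch.nn.Linear(32 * 32, 64))
x = torch.randn(8, 1, 32, 32)
loss = info_nce(encoder(x), encoder(counterfactual(x, new_scanner="B")))
</code></pre>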
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.08788">arXiv:2409.08788</a> <span> [<a href="https://arxiv.org/pdf/2409.08788">pdf</a>, <a href="https://arxiv.org/format/2409.08788">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Electrocardiogram Report Generation and Question Answering via Retrieval-Augmented Self-Supervised Modeling </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Tang%2C+J">Jialu Tang</a>, <a href="/search/cs?searchtype=author&query=Xia%2C+T">Tong Xia</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+Y">Yuan Lu</a>, <a href="/search/cs?searchtype=author&query=Mascolo%2C+C">Cecilia Mascolo</a>, <a href="/search/cs?searchtype=author&query=Saeed%2C+A">Aaqib Saeed</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2409.08788v1-abstract-full"> Interpreting electrocardiograms (ECGs) and generating comprehensive reports remain challenging tasks in cardiology, often requiring specialized expertise and significant time investment. To address these critical issues, we propose ECG-ReGen, a retrieval-based approach for ECG-to-text report generation and question answering. Our method leverages self-supervised learning for the ECG encoder, enabling efficient similarity searches and report retrieval. By combining pre-training with dynamic retrieval and Large Language Model (LLM)-based refinement, ECG-ReGen effectively analyzes ECG data and answers related queries, with the potential of improving patient care. Experiments conducted on the PTB-XL and MIMIC-IV-ECG datasets demonstrate superior performance in both in-domain and cross-domain scenarios for report generation. Furthermore, our approach exhibits competitive performance on the ECG-QA dataset compared to fully supervised methods when utilizing off-the-shelf LLMs for zero-shot question answering. This approach, effectively combining a self-supervised encoder and LLMs, offers a scalable and efficient solution for accurate ECG interpretation, holding significant potential to enhance clinical decision-making. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li>
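<p>The retrieve-then-refine pattern described in this abstract reduces to an embedding similarity search followed by an LLM call. A minimal sketch, with the ECG encoder and the LLM refinement step left as stand-ins:</p> <pre><code>
# Retrieval step of a retrieve-then-refine pipeline (cf. arXiv:2409.08788).
import numpy as np

def retrieve_reports(query_emb, bank_embs, bank_reports, k=3):
    # Cosine-similarity search over a pre-computed bank of ECG embeddings.
    sims = bank_embs @ query_emb / (
        np.linalg.norm(bank_embs, axis=1) * np.linalg.norm(query_emb) + 1e-8)
    return [bank_reports[i] for i in np.argsort(-sims)[:k]]

bank_embs = np.random.randn(100, 128)           # encoder outputs (stand-in)
bank_reports = [f"report {i}" for i in range(100)]
neighbors = retrieve_reports(np.random.randn(128), bank_embs, bank_reports)
# An LLM prompt would then combine `neighbors` with the query ECG's features
# to draft and refine the final report or answer.
</code></pre>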
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.05585">arXiv:2409.05585</a> <span> [<a href="https://arxiv.org/pdf/2409.05585">pdf</a>, <a href="https://arxiv.org/format/2409.05585">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Latent 3D Brain MRI Counterfactual </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Peng%2C+W">Wei Peng</a>, <a href="/search/cs?searchtype=author&query=Xia%2C+T">Tian Xia</a>, <a href="/search/cs?searchtype=author&query=Ribeiro%2C+F+D+S">Fabio De Sousa Ribeiro</a>, <a href="/search/cs?searchtype=author&query=Bosschieter%2C+T">Tomas Bosschieter</a>, <a href="/search/cs?searchtype=author&query=Adeli%2C+E">Ehsan Adeli</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+Q">Qingyu Zhao</a>, <a href="/search/cs?searchtype=author&query=Glocker%2C+B">Ben Glocker</a>, <a href="/search/cs?searchtype=author&query=Pohl%2C+K+M">Kilian M. Pohl</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2409.05585v1-abstract-full"> The number of samples in structural brain MRI studies is often too small to properly train deep learning models. Generative models show promise in addressing this issue by effectively learning the data distribution and generating high-fidelity MRI. However, they struggle to produce diverse, high-quality data outside the distribution defined by the training data. One way to address the issue is using causal models developed for 3D volume counterfactuals. However, accurately modeling causality in high-dimensional spaces is a challenge, so these models generally generate 3D brain MRIs of lower quality. To address these challenges, we propose a two-stage method that constructs a Structural Causal Model (SCM) within the latent space. In the first stage, we employ a VQ-VAE to learn a compact embedding of the MRI volume. Subsequently, we integrate our causal model into this latent space and execute a three-step counterfactual procedure using a closed-form Generalized Linear Model (GLM). Our experiments conducted on real-world high-resolution MRI data (1mm) demonstrate that our method can generate high-quality 3D MRI counterfactuals. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li>
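<p>The three-step counterfactual procedure (abduction, action, prediction) with a closed-form linear model can be written out directly. The sketch below works on a plain latent vector and treats the GLM parameters as given; the VQ-VAE encode/decode stages and the paper's actual attribute set are omitted, so this is only the skeleton of the idea.</p> <pre><code>
# Three-step counterfactual with a linear latent model (cf. arXiv:2409.05585).
import numpy as np

rng = np.random.default_rng(0)
W, b = rng.normal(size=(16, 2)), rng.normal(size=16)  # latent = W @ attrs + b + noise

def counterfactual_latent(z, attrs, attrs_cf):
    noise = z - (W @ attrs + b)   # 1. abduction: recover subject-specific residual
    attrs = attrs_cf              # 2. action: intervene on the attribute
    return W @ attrs + b + noise  # 3. prediction: regenerate the latent code

z = W @ np.array([60.0, 1.0]) + b + 0.1 * rng.normal(size=16)   # age-60 subject
z_cf = counterfactual_latent(z, np.array([60.0, 1.0]), np.array([20.0, 1.0]))
# z_cf would then be decoded back to an MRI volume by the VQ-VAE decoder.
</code></pre>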
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.04847">arXiv:2409.04847</a> <span> [<a href="https://arxiv.org/pdf/2409.04847">pdf</a>, <a href="https://arxiv.org/format/2409.04847">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Rethinking The Training And Evaluation of Rich-Context Layout-to-Image Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Cheng%2C+J">Jiaxin Cheng</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+Z">Zixu Zhao</a>, <a href="/search/cs?searchtype=author&query=He%2C+T">Tong He</a>, <a href="/search/cs?searchtype=author&query=Xiao%2C+T">Tianjun Xiao</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+Y">Yicong Zhou</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Z">Zheng Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2409.04847v1-abstract-full"> Recent advancements in generative models have significantly enhanced their capacity for image generation, enabling a wide range of applications such as image editing, completion and video editing. A specialized area within generative modeling is layout-to-image (L2I) generation, where predefined layouts of objects guide the generative process. In this study, we introduce a novel regional cross-attention module tailored to enrich layout-to-image generation. This module notably improves the representation of layout regions, particularly in scenarios where existing methods struggle with highly complex and detailed textual descriptions. Moreover, while current open-vocabulary L2I methods are trained in an open-set setting, their evaluations often occur in closed-set environments. To bridge this gap, we propose two metrics to assess L2I performance in open-vocabulary scenarios. Additionally, we conduct a comprehensive user study to validate the consistency of these metrics with human preferences. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li>
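<p>In its most basic form, a regional cross-attention module restricts each image query to the text tokens of its own layout region via an attention mask. The sketch below shows only that masking mechanism; the module proposed in the paper is more elaborate.</p> <pre><code>
# Region-masked cross-attention for layout-to-image generation
# (the general mechanism behind the regional module of arXiv:2409.04847).
import torch
import torch.nn.functional as F

def regional_cross_attention(q, k, v, region_of_query, region_of_token):
    # q: (nq, d) image queries; k, v: (nt, d) text tokens.
    scores = q @ k.t() / q.size(-1) ** 0.5
    mask = region_of_query[:, None] != region_of_token[None, :]
    scores = scores.masked_fill(mask, float("-inf"))  # block cross-region attention
    return F.softmax(scores, dim=-1) @ v

q = torch.randn(6, 32)
k = v = torch.randn(10, 32)
out = regional_cross_attention(q, k, v,
                               region_of_query=torch.tensor([0, 0, 0, 1, 1, 1]),
                               region_of_token=torch.arange(10) % 2)
</code></pre>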
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.03970">arXiv:2409.03970</a> <span> [<a href="https://arxiv.org/pdf/2409.03970">pdf</a>, <a href="https://arxiv.org/format/2409.03970">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Data Structures and Algorithms">cs.DS</span> </div> </div> <p class="title is-5 mathjax"> A Hybrid Vectorized Merge Sort on ARM NEON </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhou%2C+J">Jincheng Zhou</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jin Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xiang Zhang</a>, <a href="/search/cs?searchtype=author&query=Xiao%2C+T">Tiaojie Xiao</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+D">Di Ma</a>, <a href="/search/cs?searchtype=author&query=Gong%2C+C">Chunye Gong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2409.03970v1-abstract-full"> Sorting algorithms are among the most extensively researched topics in computer science and serve numerous practical applications. Although various sorts have been proposed for efficiency, different architectures offer distinct flavors to the implementation of parallel sorting. In this paper, we propose a hybrid vectorized merge sort on ARM NEON, named NEON Merge Sort for short (NEON-MS). In detail, according to the granted register functions, we first identify the optimal register number to avoid register-to-memory accesses caused by the write-back of intermediate outcomes. More importantly, following the generic merge sort framework that primarily uses a sorting network for column sort and merging networks for three types of vectorized merge, we further improve their structures for high efficiency in a unified, asymmetric way: 1) it makes optimal sorting networks with few comparators possible; 2) a hybrid implementation of both serial and vectorized merges yields a pipeline with merge instructions highly interleaved. Experiments on a single FT2000+ core show that NEON-MS is 3.8 and 2.1 times faster than std::sort and boost::block_sort, respectively, on average. Additionally, as compared to the parallel version of the latter, NEON-MS gains an average speedup of 1.25. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by ICA3PP</span> </p> </li>
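<p>The merging networks at the heart of such vectorized sorts are rounds of element-wise min/max. The NumPy sketch below emulates a bitonic merging network on two sorted "registers"; a real NEON implementation would use min/max intrinsics and the register allocation the paper describes, so this shows the data-movement pattern only.</p> <pre><code>
# Bitonic merging network emulated with NumPy lanes (cf. NEON-MS, arXiv:2409.03970).
import numpy as np

def merge_vectors(a, b):
    """Merge two sorted, equal power-of-two-length vectors into sorted
    lo/hi halves using only rounds of element-wise min/max."""
    n = len(a)
    b = b[::-1]  # reversing one sorted input yields a bitonic sequence
    seq = np.concatenate([np.minimum(a, b), np.maximum(a, b)])
    step = n // 2
    while step >= 1:  # compare-exchange at shrinking distances
        for start in range(0, 2 * n, 2 * step):
            l = seq[start:start + step].copy()
            r = seq[start + step:start + 2 * step].copy()
            seq[start:start + step] = np.minimum(l, r)
            seq[start + step:start + 2 * step] = np.maximum(l, r)
        step //= 2
    return seq[:n], seq[n:]

lo, hi = merge_vectors(np.array([1, 4, 6, 9]), np.array([2, 3, 5, 8]))
print(lo, hi)  # [1 2 3 4] [5 6 8 9]
</code></pre>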
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.03970v1-abstract-full').style.display = 'none'; document.getElementById('2409.03970v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by ICA3PP</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.17377">arXiv:2408.17377</a> <span> [<a href="https://arxiv.org/pdf/2408.17377">pdf</a>, <a href="https://arxiv.org/format/2408.17377">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> NDP: Next Distribution Prediction as a More Broad Target </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ruan%2C+J">Junhao Ruan</a>, <a href="/search/cs?searchtype=author&query=Abudula%2C+A">Abudukeyumu Abudula</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+X">Xinyu Liu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+B">Bei Li</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yinqiao Li</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+C">Chenglong Wang</a>, <a href="/search/cs?searchtype=author&query=Fan%2C+Y">Yuchun Fan</a>, <a href="/search/cs?searchtype=author&query=Ge%2C+Y">Yuan Ge</a>, <a href="/search/cs?searchtype=author&query=Xiao%2C+T">Tong Xiao</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+J">Jingbo Zhu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.17377v1-abstract-short" style="display: inline;"> Large language models (LLMs) trained on next-token prediction (NTP) paradigm have demonstrated powerful capabilities. However, the existing NTP paradigm contains several limitations, particularly related to planned task complications and error propagation during inference. In our work, we extend the critique of NTP, highlighting its limitation also due to training with a narrow objective: the pred… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.17377v1-abstract-full').style.display = 'inline'; document.getElementById('2408.17377v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.17377v1-abstract-full" style="display: none;"> Large language models (LLMs) trained on next-token prediction (NTP) paradigm have demonstrated powerful capabilities. However, the existing NTP paradigm contains several limitations, particularly related to planned task complications and error propagation during inference. In our work, we extend the critique of NTP, highlighting its limitation also due to training with a narrow objective: the prediction of a sub-optimal one-hot distribution. 
arXiv:2408.13006 (https://arxiv.org/abs/2408.13006) [pdf, other]
Subjects: cs.CL (Computation and Language)
Title: Systematic Evaluation of LLM-as-a-Judge in LLM Alignment Tasks: Explainable Metrics and Diverse Prompt Templates
Authors: Hui Wei, Shenghua He, Tian Xia, Andy Wong, Jingyang Lin, Mei Han
Abstract: Alignment approaches such as RLHF and DPO are actively investigated to align large language models (LLMs) with human preferences. Commercial LLMs like GPT-4 have recently been employed to evaluate and compare different LLM alignment approaches. These models act as surrogates for human evaluators due to their promising ability to approximate human preferences with remarkably faster feedback and lower costs. This methodology is referred to as LLM-as-a-judge. However, concerns regarding its reliability have emerged, attributed to LLM judges' biases and inconsistent decision-making. Previous research has sought to develop robust evaluation frameworks for assessing the reliability of LLM judges and their alignment with human preferences. However, the metrics employed often lack adequate explainability and fail to address LLMs' internal inconsistency. Additionally, existing studies inadequately explore the impact of different prompt templates when applying LLM-as-a-judge methods, which leads to potentially inconsistent comparisons between alignment algorithms. In this work, we systematically evaluate LLM judges on alignment tasks (e.g., summarization) by defining evaluation metrics with improved theoretical interpretability and by disentangling reliability metrics from LLM internal inconsistency. We develop a framework to evaluate, compare, and visualize the reliability and alignment of LLM judges, providing informative observations that help choose LLM judges for alignment tasks. Our results indicate a significant impact of prompt templates on LLM judge performance, as well as a mediocre alignment level between the tested LLM judges and human evaluators.
Submitted: 23 August, 2024; originally announced August 2024.
Comments: Preprint, under review. 17 pages, 7 figures, 16 tables.
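One concrete instance of the kind of internal-consistency probe the abstract alludes to is positional consistency: ask the judge twice with the response order swapped and check whether the verdict survives. A hedged sketch; judge() is a hypothetical callable, not an API from the paper:

```python
def positional_consistency(judge, pairs):
    """Fraction of pairs where the judge's verdict survives swapping
    the order of the two responses, a simple internal-consistency probe.
    judge(prompt, resp_a, resp_b) -> 'A' or 'B' is a hypothetical API.
    """
    consistent = 0
    for prompt, a, b in pairs:
        first = judge(prompt, a, b)
        second = judge(prompt, b, a)
        # In the swapped call, 'A' refers to response b, so map back.
        unswapped = {"A": "B", "B": "A"}[second]
        consistent += (first == unswapped)
    return consistent / len(pairs)
```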
arXiv:2408.12109 (https://arxiv.org/abs/2408.12109) [pdf, other]
Subjects: cs.CV (Computer Vision and Pattern Recognition); cs.CL (Computation and Language)
Title: RoVRM: A Robust Visual Reward Model Optimized via Auxiliary Textual Preference Data
Authors: Chenglong Wang, Yang Gan, Yifu Huo, Yongyu Mu, Murun Yang, Qiaozhi He, Tong Xiao, Chunliang Zhang, Tongran Liu, Quan Du, Di Yang, Jingbo Zhu
Abstract: Large vision-language models (LVLMs) often fail to align with human preferences, leading to issues like generating misleading content without proper visual context (also known as hallucination). A promising solution is to use human-preference alignment techniques, such as best-of-n sampling and reinforcement learning. However, these techniques face the difficulty arising from the scarcity of visual preference data, which is required to train a visual reward model (VRM). In this work, we continue this line of research. We present a Robust Visual Reward Model (RoVRM) that improves human-preference alignment for LVLMs. RoVRM leverages auxiliary textual preference data through three-phase progressive training and optimal transport-based preference data selection to effectively mitigate the scarcity of visual preference data. We experiment with RoVRM on commonly used vision-language tasks based on the LLaVA-1.5-7B and -13B models. Experimental results demonstrate that RoVRM consistently outperforms traditional VRMs. Furthermore, our three-phase progressive training and preference data selection approaches yield consistent performance gains over ranking-based alignment techniques, such as direct preference optimization.
Submitted: 21 August, 2024; originally announced August 2024.
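The abstract does not spell out the optimal transport-based selection, but a minimal sketch of the general idea might rank textual preference samples by their transport cost toward the visual-data embedding cloud under an entropy-regularized (Sinkhorn) plan. Everything here, including the uniform marginals, is an assumption for illustration:

```python
import numpy as np

def sinkhorn_plan(cost, eps=0.5, iters=200):
    # Entropy-regularized OT with uniform marginals (Sinkhorn iterations).
    m, n = cost.shape
    K = np.exp(-cost / eps)
    a, b = np.full(m, 1.0 / m), np.full(n, 1.0 / n)
    u, v = np.ones(m), np.ones(n)
    for _ in range(iters):
        u = a / (K @ v)
        v = b / (K.T @ u)
    return u[:, None] * K * v[None, :]    # transport plan

def select_textual(text_emb, visual_emb, k):
    # Rank textual samples by expected transport cost to the visual data;
    # keep the k cheapest (most visually compatible) ones.
    cost = np.linalg.norm(text_emb[:, None] - visual_emb[None, :], axis=-1)
    plan = sinkhorn_plan(cost)
    per_sample = (plan * cost).sum(axis=1) / plan.sum(axis=1)
    return np.argsort(per_sample)[:k]
```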
arXiv:2408.09490 (https://arxiv.org/abs/2408.09490) [pdf, other]
Subjects: cs.LG (Machine Learning); cs.AI (Artificial Intelligence)
Title: Leveraging Invariant Principle for Heterophilic Graph Structure Distribution Shifts
Authors: Jinluan Yang, Zhengyu Chen, Teng Xiao, Wenqiao Zhang, Yong Lin, Kun Kuang
Abstract: Heterophilic Graph Neural Networks (HGNNs) have shown promising results for semi-supervised learning tasks on graphs. Notably, most real-world heterophilic graphs are composed of a mixture of nodes with different neighbor patterns, exhibiting local node-level homophilic and heterophilic structures. However, existing works are devoted only to designing better HGNN backbones or architectures for node classification on heterophilic and homophilic graph benchmarks simultaneously, and their analyses of HGNN performance with respect to nodes assume a fixed data distribution, without exploring the effect of this structural difference between training and testing nodes. How to learn invariant node representations on heterophilic graphs that handle this structural difference, or distribution shift, remains unexplored. In this paper, we first discuss the limitations of previous graph-based invariant learning methods from the perspective of data augmentation. We then propose HEI, a framework that generates invariant node representations by incorporating heterophily information to infer latent environments without augmentation; these environments are then used for invariant prediction under heterophilic graph structure distribution shifts. We theoretically show that our method achieves guaranteed performance under such shifts. Extensive experiments on various benchmarks and backbones demonstrate the effectiveness of our method compared with existing state-of-the-art baselines.
Submitted: 17 October, 2024; v1 submitted 18 August, 2024; originally announced August 2024.
Comments: 15 pages, 7 figures.
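HEI's specific environment-inference step is not given in the abstract; as a generic stand-in for "invariant prediction across inferred environments", here is the classic IRMv1 penalty applied per environment group (PyTorch; grouping nodes by heterophily level is an assumption, not the paper's exact procedure):

```python
import torch
import torch.nn.functional as F

def irm_penalty(logits, labels):
    # IRMv1: squared gradient of the risk w.r.t. a dummy classifier
    # scale, measuring how far from optimal the shared classifier is
    # within one environment.
    scale = torch.ones(1, requires_grad=True)
    loss = F.cross_entropy(logits * scale, labels)
    grad = torch.autograd.grad(loss, scale, create_graph=True)[0]
    return (grad ** 2).sum()

def invariant_loss(env_batches, model, lam=1.0):
    # env_batches: list of (features, labels) grouped by inferred latent
    # environment (HEI infers these from heterophily information).
    total = 0.0
    for x, y in env_batches:
        logits = model(x)
        total = total + F.cross_entropy(logits, y) + lam * irm_penalty(logits, y)
    return total / len(env_batches)
```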
arXiv:2408.08313 (https://arxiv.org/abs/2408.08313) [pdf, other]
Subjects: cs.LG (Machine Learning); cs.AI (Artificial Intelligence); cs.CL (Computation and Language); cs.CV (Computer Vision and Pattern Recognition)
Title: Can Large Language Models Understand Symbolic Graphics Programs?
Authors: Zeju Qiu, Weiyang Liu, Haiwen Feng, Zhen Liu, Tim Z. Xiao, Katherine M. Collins, Joshua B. Tenenbaum, Adrian Weller, Michael J. Black, Bernhard Schölkopf
Abstract: Against the backdrop of enthusiasm for large language models (LLMs), there is an urgent need to scientifically assess their capabilities and shortcomings. This is nontrivial in part because it is difficult to find tasks the models have not encountered during training. Utilizing symbolic graphics programs, we propose a domain well suited to testing multiple spatial-semantic reasoning skills of LLMs. Popular in computer graphics, these programs procedurally generate visual data. While LLMs exhibit impressive skills in general program synthesis and analysis, symbolic graphics programs offer a new layer of evaluation: they allow us to test an LLM's ability to answer semantic questions of different granularity about the images or 3D geometries without a vision encoder. To semantically understand the symbolic programs, LLMs would need the ability to "imagine" and reason about how the corresponding graphics content would look from the symbolic description alone. We use this task to evaluate LLMs by creating a large benchmark for the semantic visual understanding of symbolic graphics programs, built procedurally with minimal human effort. Particular emphasis is placed on transformations of images that leave the image-level semantics invariant while introducing significant changes to the underlying program. We evaluate commercial and open-source LLMs on our benchmark to assess their ability to reason about the visual output of programs, finding that LLMs considered stronger at reasoning generally perform better. Lastly, we introduce a novel method to improve this ability: Symbolic Instruction Tuning (SIT), in which the LLM is fine-tuned with pre-collected instruction data on symbolic graphics programs. Interestingly, we find that SIT not only improves the LLM's understanding of symbolic programs but also improves general reasoning ability on various other benchmarks.
Submitted: 7 October, 2024; v1 submitted 15 August, 2024; originally announced August 2024.
Comments: Technical Report v2 (46 pages, 24 figures, project page: https://sgp-bench.github.io/, substantial update from v1).
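To make the task concrete, a toy item in the spirit of the benchmark could look like the following: the model sees only the program text, never a rendered image, and must answer a semantic question about what the program would draw. This is an illustrative format only; the real benchmark's schema lives at https://sgp-bench.github.io/.

```python
# The model is prompted with the symbolic program, not its rendering.
item = {
    "program": """
        <svg viewBox="0 0 100 100">
          <circle cx="50" cy="25" r="15" fill="yellow"/>
          <rect x="30" y="55" width="40" height="30" fill="red"/>
        </svg>
    """,
    "question": "What everyday scene could this depict?",
    "options": ["a sun above a house", "a face", "a tree", "a car"],
    "answer": "a sun above a house",
}
prompt = (f"{item['program']}\n"
          f"Q: {item['question']}\n"
          f"Options: {', '.join(item['options'])}")
```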
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.08313v2-abstract-full').style.display = 'none'; document.getElementById('2408.08313v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 15 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Technical Report v2 (46 pages, 24 figures, project page: https://sgp-bench.github.io/, substantial update from v1)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.07249">arXiv:2408.07249</a> <span> [<a href="https://arxiv.org/pdf/2408.07249">pdf</a>, <a href="https://arxiv.org/format/2408.07249">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> </div> </div> <p class="title is-5 mathjax"> GQE: Generalized Query Expansion for Enhanced Text-Video Retrieval </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Bai%2C+Z">Zechen Bai</a>, <a href="/search/cs?searchtype=author&query=Xiao%2C+T">Tianjun Xiao</a>, <a href="/search/cs?searchtype=author&query=He%2C+T">Tong He</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+P">Pichao Wang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Z">Zheng Zhang</a>, <a href="/search/cs?searchtype=author&query=Brox%2C+T">Thomas Brox</a>, <a href="/search/cs?searchtype=author&query=Shou%2C+M+Z">Mike Zheng Shou</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.07249v1-abstract-short" style="display: inline;"> In the rapidly expanding domain of web video content, the task of text-video retrieval has become increasingly critical, bridging the semantic gap between textual queries and video data. This paper introduces a novel data-centric approach, Generalized Query Expansion (GQE), to address the inherent information imbalance between text and video, enhancing the effectiveness of text-video retrieval sys… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.07249v1-abstract-full').style.display = 'inline'; document.getElementById('2408.07249v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.07249v1-abstract-full" style="display: none;"> In the rapidly expanding domain of web video content, the task of text-video retrieval has become increasingly critical, bridging the semantic gap between textual queries and video data. This paper introduces a novel data-centric approach, Generalized Query Expansion (GQE), to address the inherent information imbalance between text and video, enhancing the effectiveness of text-video retrieval systems. 
arXiv:2408.02714 (https://arxiv.org/abs/2408.02714) [pdf, other]
Subjects: cs.LG (Machine Learning); cs.AI (Artificial Intelligence)
Title: MDM: Advancing Multi-Domain Distribution Matching for Automatic Modulation Recognition Dataset Synthesis
Authors: Dongwei Xu, Jiajun Chen, Yao Lu, Tianhao Xia, Qi Xuan, Wei Wang, Yun Lin, Xiaoniu Yang
Abstract: Recently, deep learning technology has been successfully introduced into Automatic Modulation Recognition (AMR) tasks. However, the success of deep learning is largely attributable to training on large-scale datasets, and such large amounts of data put enormous pressure on storage, transmission, and model training. To address this, some researchers have proposed dataset distillation, which aims to compress large training sets into much smaller synthetic datasets while maintaining performance. While numerous data distillation techniques have been developed in image processing, the unique characteristics of signals set them apart: signals exhibit distinct features across domains and require specialized approaches for analysis and processing. To this end, we propose a novel dataset distillation method, Multi-domain Distribution Matching (MDM). MDM employs the Discrete Fourier Transform (DFT) to translate time-domain signals into the frequency domain, then uses a model to compute distribution-matching losses between the synthetic and real datasets in both the time and frequency domains; these two losses are integrated to update the synthetic dataset. We conduct extensive experiments on three AMR datasets. Experimental results show that, compared with baseline methods, our method achieves better performance under the same compression ratio. Furthermore, cross-architecture generalization experiments on several models show that our synthetic datasets generalize well to unseen models.
Submitted: 5 August, 2024; originally announced August 2024.
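A minimal sketch of the two-domain loss follows. It matches raw-signal feature means rather than the model embeddings the paper uses, just to keep the example self-contained:

```python
import numpy as np

def mean_matching_loss(real, synth):
    # Simple distribution-matching surrogate: squared distance of means.
    return np.mean((real.mean(axis=0) - synth.mean(axis=0)) ** 2)

def mdm_loss(real_sig, synth_sig, alpha=0.5):
    # Time-domain term plus frequency-domain term via the DFT, echoing
    # MDM's two matching losses. Inputs: (batch, signal_length) arrays.
    t_loss = mean_matching_loss(real_sig, synth_sig)
    f_real = np.abs(np.fft.rfft(real_sig, axis=1))
    f_synth = np.abs(np.fft.rfft(synth_sig, axis=1))
    f_loss = mean_matching_loss(f_real, f_synth)
    return alpha * t_loss + (1 - alpha) * f_loss
```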
arXiv:2408.02689 (https://arxiv.org/abs/2408.02689) [pdf, ps, other]
Subjects: cs.LG (Machine Learning); cs.AI (Artificial Intelligence)
Title: Spatio-Temporal Partial Sensing Forecast for Long-term Traffic
Authors: Zibo Liu, Zhe Jiang, Zelin Xu, Tingsong Xiao, Zhengkun Xiao, Haibo Wang, Shigang Chen
Abstract: Traffic forecasting uses recent measurements from sensors installed at chosen locations to forecast future road traffic. Existing work either assumes all locations are equipped with sensors or focuses on short-term forecasts. This paper studies partial-sensing forecasting of long-term traffic, assuming sensors exist at only some locations. The problem is important for lowering the infrastructure investment cost in traffic management, since deploying sensors at all locations would be prohibitively expensive. However, it is challenging due to the unknown distribution at unsensed locations, the intricate spatio-temporal correlations in long-term forecasting, and noise in data and irregularities in traffic patterns (e.g., road closures). We propose a Spatio-Temporal Partial Sensing (STPS) forecast model for long-term traffic prediction with several novel contributions, including a rank-based embedding technique that captures irregularities and overcomes noise, a spatial transfer matrix that overcomes the spatial distribution shift from permanently sensed locations to unsensed locations, and a multi-step training process that utilizes all available data to successively refine the model parameters for better accuracy. Extensive experiments on several real-world traffic datasets demonstrate that STPS outperforms the state of the art and achieves superior accuracy in partial-sensing long-term forecasting.
Submitted: 2 August, 2024; originally announced August 2024.
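The "spatial transfer matrix" idea can be sketched as a learned linear map from sensed locations to unsensed ones; below is a least-squares stand-in (the paper's matrix is presumably learned jointly with the model, so this is illustrative only, and the historical readings at unsensed locations are an assumed training resource):

```python
import numpy as np

def fit_transfer(sensed_hist, unsensed_hist):
    # sensed_hist: (T, S) readings at sensed locations;
    # unsensed_hist: (T, U) historical readings at unsensed locations.
    # Solve sensed_hist @ T_mat ~= unsensed_hist in least squares.
    T_mat, *_ = np.linalg.lstsq(sensed_hist, unsensed_hist, rcond=None)
    return T_mat                       # (S, U) spatial transfer matrix

def forecast_unsensed(sensed_future, T_mat):
    # Map forecasts at sensed locations onto unsensed locations.
    return sensed_future @ T_mat       # (T', U)
```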
arXiv:2408.01944 (https://arxiv.org/abs/2408.01944) [pdf, ps, other]
Subjects: cs.CV (Computer Vision and Pattern Recognition); eess.IV (Image and Video Processing)
Title: RobNODDI: Robust NODDI Parameter Estimation with Adaptive Sampling under Continuous Representation
Authors: Taohui Xiao, Jian Cheng, Wenxin Fan, Jing Yang, Cheng Li, Enqing Dong, Shanshan Wang
Abstract: Neurite Orientation Dispersion and Density Imaging (NODDI) is an important imaging technology for evaluating the microstructure of brain tissue, which is of great significance for the discovery and treatment of various neurological diseases. Current deep learning-based methods perform parameter estimation from diffusion magnetic resonance imaging (dMRI) with a small number of diffusion gradients, speeding up estimation and improving accuracy. However, the diffusion directions used by most existing deep learning models at test time need to be strictly consistent with those used in training, which leads to poor generalization and robustness in dMRI parameter estimation. In this work, we verify for the first time that the estimation performance of current mainstream methods degrades significantly when the testing and training diffusion directions are inconsistent. We propose a robust NODDI parameter estimation method with adaptive sampling under continuous representation (RobNODDI), in which long short-term memory (LSTM) units and fully connected layers learn continuous representations of the signal. We conduct experiments on 100 subjects from the Human Connectome Project (HCP) dataset, of which 60 are used for training, 20 for validation, and 20 for testing. The test results indicate that RobNODDI improves the generalization and robustness of the deep learning model, enhancing the stability and flexibility of deep learning-based NODDI parameter estimation.
Submitted: 4 August, 2024; originally announced August 2024.
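A minimal PyTorch sketch of the "LSTM plus fully connected layers" design the abstract describes, treating the measurements along sampled gradient directions as a sequence so the model is not tied to one fixed direction set (layer sizes and the three-parameter NODDI head are assumptions):

```python
import torch
import torch.nn as nn

class SignalLSTM(nn.Module):
    # Per-voxel regressor over a variable-length diffusion signal.
    def __init__(self, hidden=64, n_params=3):
        super().__init__()
        self.lstm = nn.LSTM(input_size=1, hidden_size=hidden,
                            batch_first=True)
        self.head = nn.Sequential(nn.Linear(hidden, 64), nn.ReLU(),
                                  nn.Linear(64, n_params))

    def forward(self, sig):            # sig: (batch, n_directions)
        out, _ = self.lstm(sig.unsqueeze(-1))
        return self.head(out[:, -1])   # e.g., NODDI's ICVF, ISOVF, ODI
```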
arXiv:2408.01890 (https://arxiv.org/abs/2408.01890) [pdf, other]
Subjects: cs.CL (Computation and Language)
Title: Cross-layer Attention Sharing for Large Language Models
Authors: Yongyu Mu, Yuzhang Wu, Yuchun Fan, Chenglong Wang, Hengyu Li, Qiaozhi He, Murun Yang, Tong Xiao, Jingbo Zhu
Abstract: As large language models (LLMs) evolve, the increase in model depth and parameter count leads to substantial redundancy. To enhance the efficiency of the attention mechanism, previous works primarily compress the KV cache or group attention heads, largely overlooking redundancy between layers. Our comprehensive analyses across various LLMs show that highly similar attention patterns persist within most layers. It is intuitive to save computation by sharing attention weights across layers. However, further analysis reveals two challenges: (1) directly sharing the weight matrix without carefully rearranging the attention heads proves ineffective; (2) shallow layers are vulnerable to small deviations in attention weights. Driven by these insights, we introduce LiSA, a lightweight substitute for self-attention in well-trained LLMs. LiSA employs tiny feed-forward networks to align attention heads between adjacent layers and low-rank matrices to approximate differences in layer-wise attention weights. Evaluations on 13 typical benchmarks demonstrate that LiSA maintains high response quality in terms of accuracy and perplexity while reducing redundant attention computation within 53-84% of the total layers. Our implementations of LiSA achieve a 6x compression of Q and K, with maximum throughput improvements of 19.5% for LLaMA3-8B and 32.3% for LLaMA2-7B.
Submitted: 3 August, 2024; originally announced August 2024.
Comments: Work in progress.
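A rough sketch of the stated ingredients: a tiny feed-forward network mixing heads from the previous layer's attention map, plus a low-rank correction for the layer-wise difference. This is only a reading of the abstract; the actual LiSA wiring (where Q and K themselves are compressed) may differ substantially.

```python
import torch
import torch.nn as nn

class SharedAttentionSketch(nn.Module):
    # Reuse the previous layer's attention map instead of recomputing
    # Q @ K^T for this layer.
    def __init__(self, n_heads, seq_len, rank=8):
        super().__init__()
        self.align = nn.Sequential(nn.Linear(n_heads, n_heads), nn.ReLU(),
                                   nn.Linear(n_heads, n_heads))
        self.u = nn.Parameter(torch.randn(seq_len, rank) * 0.02)
        self.v = nn.Parameter(torch.randn(rank, seq_len) * 0.02)

    def forward(self, prev_attn):      # (batch, heads, seq, seq)
        # Mix head channels with a tiny FFN (head alignment) ...
        mixed = self.align(prev_attn.permute(0, 2, 3, 1)).permute(0, 3, 1, 2)
        # ... then add a low-rank estimate of the layer-to-layer delta.
        return torch.softmax(mixed + self.u @ self.v, dim=-1)
```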
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Working in process</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.00041">arXiv:2408.00041</a> <span> [<a href="https://arxiv.org/pdf/2408.00041">pdf</a>, <a href="https://arxiv.org/format/2408.00041">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Con4m: Context-aware Consistency Learning Framework for Segmented Time Series Classification </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+J">Junru Chen</a>, <a href="/search/cs?searchtype=author&query=Cao%2C+T">Tianyu Cao</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+J">Jing Xu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+J">Jiahe Li</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Z">Zhilong Chen</a>, <a href="/search/cs?searchtype=author&query=Xiao%2C+T">Tao Xiao</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Y">Yang Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.00041v2-abstract-short" style="display: inline;"> Time Series Classification (TSC) encompasses two settings: classifying entire sequences or classifying segmented subsequences. The raw time series for segmented TSC usually contain Multiple classes with Varying Duration of each class (MVD). Therefore, the characteristics of MVD pose unique challenges for segmented TSC, yet have been largely overlooked by existing works. Specifically, there exists… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.00041v2-abstract-full').style.display = 'inline'; document.getElementById('2408.00041v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.00041v2-abstract-full" style="display: none;"> Time Series Classification (TSC) encompasses two settings: classifying entire sequences or classifying segmented subsequences. The raw time series for segmented TSC usually contain Multiple classes with Varying Duration of each class (MVD). Therefore, the characteristics of MVD pose unique challenges for segmented TSC, yet have been largely overlooked by existing works. Specifically, there exists a natural temporal dependency between consecutive instances (segments) to be classified within MVD. However, mainstream TSC models rely on the assumption of independent and identically distributed (i.i.d.), focusing on independently modeling each segment. Additionally, annotators with varying expertise may provide inconsistent boundary labels, leading to unstable performance of noise-free TSC models. To address these challenges, we first formally demonstrate that valuable contextual information enhances the discriminative power of classification instances. 
Leveraging the contextual priors of MVD at both the data and label levels, we propose a novel consistency learning framework Con4m, which effectively utilizes contextual information more conducive to discriminating consecutive segments in segmented TSC tasks, while harmonizing inconsistent boundary labels for training. Extensive experiments across multiple datasets validate the effectiveness of Con4m in handling segmented TSC tasks on MVD. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.00041v2-abstract-full').style.display = 'none'; document.getElementById('2408.00041v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 31 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> </li> </ol> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Xia%2C+T&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Xia%2C+T&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Xia%2C+T&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Xia%2C+T&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Xia%2C+T&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Xia%2C+T&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">…</span></li> </ul> </nav> <div class="is-hidden-tablet"> <!-- feedback for mobile only --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a> </span> </div> </div> </main> <footer> <div class="columns is-desktop" role="navigation" aria-label="Secondary"> <!-- MetaColumn 1 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" 
role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div> <!-- end MetaColumn 1 --> <!-- MetaColumn 2 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div> <!-- end MetaColumn 2 --> </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>