Search | arXiv e-print repository

Showing 1–50 of 501 results for author: Tang, C
Searching in archive cs. Results sorted by announcement date (newest first), 50 per page.

</div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&amp;query=Tang%2C+C&amp;start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&amp;query=Tang%2C+C&amp;start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Tang%2C+C&amp;start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Tang%2C+C&amp;start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Tang%2C+C&amp;start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Tang%2C+C&amp;start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">&hellip;</span></li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.20748">arXiv:2503.20748</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.20748">pdf</a>, <a href="https://arxiv.org/format/2503.20748">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> UniSTD: Towards Unified Spatio-Temporal Learning across Diverse Disciplines </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Tang%2C+C">Chen Tang</a>, <a href="/search/cs?searchtype=author&amp;query=Ma%2C+X">Xinzhu Ma</a>, <a href="/search/cs?searchtype=author&amp;query=Su%2C+E">Encheng Su</a>, <a href="/search/cs?searchtype=author&amp;query=Song%2C+X">Xiufeng Song</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+X">Xiaohong Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+W">Wei-Hong Li</a>, <a href="/search/cs?searchtype=author&amp;query=Bai%2C+L">Lei Bai</a>, <a href="/search/cs?searchtype=author&amp;query=Ouyang%2C+W">Wanli Ouyang</a>, <a href="/search/cs?searchtype=author&amp;query=Yue%2C+X">Xiangyu Yue</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.20748v1-abstract-short" style="display: inline;"> Traditional spatiotemporal models generally rely on task-specific architectures, which limit their generalizability and scalability across diverse tasks due to domain-specific design requirements. 
In this paper, we introduce \textbf{UniSTD}, a unified Transformer-based framework for spatiotemporal modeling, which is inspired by advances in recent foundation models with the two-stage pretraining-th&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.20748v1-abstract-full').style.display = 'inline'; document.getElementById('2503.20748v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.20748v1-abstract-full" style="display: none;"> Traditional spatiotemporal models generally rely on task-specific architectures, which limit their generalizability and scalability across diverse tasks due to domain-specific design requirements. In this paper, we introduce \textbf{UniSTD}, a unified Transformer-based framework for spatiotemporal modeling, which is inspired by advances in recent foundation models with the two-stage pretraining-then-adaption paradigm. Specifically, our work demonstrates that task-agnostic pretraining on 2D vision and vision-text datasets can build a generalizable model foundation for spatiotemporal learning, followed by specialized joint training on spatiotemporal datasets to enhance task-specific adaptability. To improve the learning capabilities across domains, our framework employs a rank-adaptive mixture-of-expert adaptation by using fractional interpolation to relax the discrete variables so that can be optimized in the continuous space. Additionally, we introduce a temporal module to incorporate temporal dynamics explicitly. We evaluate our approach on a large-scale dataset covering 10 tasks across 4 disciplines, demonstrating that a unified spatiotemporal model can achieve scalable, cross-task learning and support up to 10 tasks simultaneously within one model while reducing training costs in multi-domain applications. Code will be available at https://github.com/1hunters/UniSTD. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.20748v1-abstract-full').style.display = 'none'; document.getElementById('2503.20748v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. 
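
The rank-adaptive adaptation described in this abstract relaxes a discrete rank choice into a continuous one via fractional interpolation. As a rough illustration of that general idea only (not the authors' implementation; the module name, soft-mask scheme, and hyperparameters below are assumptions), a low-rank adapter can expose its effective rank as a differentiable parameter:

```python
# Sketch (not the UniSTD code): a low-rank adapter whose effective rank is a
# continuous parameter. A soft mask over the rank dimension lets a "fractional"
# rank be optimized by gradient descent instead of searched over integer values.
# All names here (SoftRankAdapter, max_rank, temperature) are illustrative.
import torch
import torch.nn as nn


class SoftRankAdapter(nn.Module):
    def __init__(self, dim: int, max_rank: int = 16, temperature: float = 0.25):
        super().__init__()
        self.down = nn.Linear(dim, max_rank, bias=False)   # W_down: dim -> max_rank
        self.up = nn.Linear(max_rank, dim, bias=False)     # W_up:   max_rank -> dim
        self.rank_logit = nn.Parameter(torch.tensor(0.0))  # continuous rank control
        self.max_rank = max_rank
        self.temperature = temperature
        nn.init.zeros_(self.up.weight)                      # adapter starts as the identity map

    def rank_mask(self) -> torch.Tensor:
        # Fractional rank r in (0, max_rank); component i is kept with weight
        # sigmoid((r - i) / T), so neighbouring integer ranks are interpolated smoothly.
        r = self.max_rank * torch.sigmoid(self.rank_logit)
        idx = torch.arange(self.max_rank, device=r.device, dtype=r.dtype)
        return torch.sigmoid((r - idx) / self.temperature)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        h = self.down(x) * self.rank_mask()                 # soft-truncated rank
        return x + self.up(h)                               # residual adaptation


if __name__ == "__main__":
    adapter = SoftRankAdapter(dim=64)
    y = adapter(torch.randn(8, 64))
    print(y.shape, float(adapter.rank_mask().sum()))        # effective (soft) rank
```

With this relaxation, gradient descent can trade adapter capacity against the task loss instead of searching over integer ranks.
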
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to CVPR 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.19951">arXiv:2503.19951</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.19951">pdf</a>, <a href="https://arxiv.org/format/2503.19951">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> ACVUBench: Audio-Centric Video Understanding Benchmark </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Yang%2C+Y">Yudong Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhuang%2C+J">Jimin Zhuang</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+G">Guangzhi Sun</a>, <a href="/search/cs?searchtype=author&amp;query=Tang%2C+C">Changli Tang</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Y">Yixuan Li</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+P">Peihan Li</a>, <a href="/search/cs?searchtype=author&amp;query=Jiang%2C+Y">Yifan Jiang</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+W">Wei Li</a>, <a href="/search/cs?searchtype=author&amp;query=Ma%2C+Z">Zejun Ma</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+C">Chao Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.19951v1-abstract-short" style="display: inline;"> Audio often serves as an auxiliary modality in video understanding tasks of audio-visual large language models (LLMs), merely assisting in the comprehension of visual information. However, a thorough understanding of videos significantly depends on auditory information, as audio offers critical context, emotional cues, and semantic meaning that visual data alone often lacks. This paper proposes an&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.19951v1-abstract-full').style.display = 'inline'; document.getElementById('2503.19951v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.19951v1-abstract-full" style="display: none;"> Audio often serves as an auxiliary modality in video understanding tasks of audio-visual large language models (LLMs), merely assisting in the comprehension of visual information. However, a thorough understanding of videos significantly depends on auditory information, as audio offers critical context, emotional cues, and semantic meaning that visual data alone often lacks. This paper proposes an audio-centric video understanding benchmark (ACVUBench) to evaluate the video comprehension capabilities of multimodal LLMs with a particular focus on auditory information. Specifically, ACVUBench incorporates 2,662 videos spanning 18 different domains with rich auditory information, together with over 13k high-quality human annotated or validated question-answer pairs. 
Moreover, ACVUBench introduces a suite of carefully designed audio-centric tasks, holistically testing the understanding of both audio content and audio-visual interactions in videos. A thorough evaluation across a diverse range of open-source and proprietary multimodal LLMs is performed, followed by the analyses of deficiencies in audio-visual LLMs. Demos are available at https://github.com/lark-png/ACVUBench. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.19951v1-abstract-full').style.display = 'none'; document.getElementById('2503.19951v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.15518">arXiv:2503.15518</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.15518">pdf</a>, <a href="https://arxiv.org/format/2503.15518">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> </div> </div> <p class="title is-5 mathjax"> Robot Character Generation and Adaptive Human-Robot Interaction with Personality Shaping </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Tang%2C+C">Cheng Tang</a>, <a href="/search/cs?searchtype=author&amp;query=Tang%2C+C">Chao Tang</a>, <a href="/search/cs?searchtype=author&amp;query=Gong%2C+S">Steven Gong</a>, <a href="/search/cs?searchtype=author&amp;query=Kwok%2C+T+M">Thomas M. Kwok</a>, <a href="/search/cs?searchtype=author&amp;query=Hu%2C+Y">Yue Hu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.15518v2-abstract-short" style="display: inline;"> We present a novel framework for designing emotionally agile robots with dynamic personalities and memory-based learning, with the aim of performing adaptive and non-deterministic interactions with humans while conforming to shared social understanding. While existing work has largely focused on emotion recognition and static response systems, many approaches rely on sentiment analysis and action&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.15518v2-abstract-full').style.display = 'inline'; document.getElementById('2503.15518v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.15518v2-abstract-full" style="display: none;"> We present a novel framework for designing emotionally agile robots with dynamic personalities and memory-based learning, with the aim of performing adaptive and non-deterministic interactions with humans while conforming to shared social understanding. While existing work has largely focused on emotion recognition and static response systems, many approaches rely on sentiment analysis and action mapping frameworks that are pre-defined with limited dimensionality and fixed configurations, lacking the flexibility of dynamic personality traits and memory-enabled adaptation. 
Other systems are often restricted to limited modes of expression and fail to develop a causal relationship between human behavior and the robot&#39;s proactive physical actions, resulting in constrained adaptability and reduced responsiveness in complex, dynamic interactions. Our methodology integrates the Big Five Personality Traits, Appraisal Theory, and abstracted memory layers through Large Language Models (LLMs). The LLM generates a parameterized robot personality based on the Big Five, processes human language and sentiments, evaluates human behavior using Appraisal Theory, and generates emotions and selects appropriate actions adapted by historical context over time. We validated the framework by testing three robots with distinct personalities in identical background contexts and found that personality, appraisal, and memory influence the adaptability of human-robot interactions. The impact of the individual components was further validated through ablation tests. We conclude that this system enables robots to engage in meaningful and personalized interactions with users, and holds significant potential for applications in domains such as pet robots, assistive robots, educational robots, and collaborative functional robots, where cultivating tailored relationships and enriching user experiences are essential. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.15518v2-abstract-full').style.display = 'none'; document.getElementById('2503.15518v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 2 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.14938">arXiv:2503.14938</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.14938">pdf</a>, <a href="https://arxiv.org/format/2503.14938">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Optimal Transport Adapter Tuning for Bridging Modality Gaps in Few-Shot Remote Sensing Scene Classification </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Ji%2C+Z">Zhong Ji</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+C">Ci Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+J">Jingren Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Tang%2C+C">Chen Tang</a>, <a href="/search/cs?searchtype=author&amp;query=Pang%2C+Y">Yanwei Pang</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+X">Xuelong Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.14938v1-abstract-short" style="display: inline;"> Few-Shot Remote Sensing Scene Classification (FS-RSSC) presents the challenge of classifying remote sensing images with limited labeled samples. Existing methods typically emphasize single-modal feature learning, neglecting the potential benefits of optimizing multi-modal representations. 
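
As a loose illustration of how a Big Five-parameterized personality and an abstracted interaction memory could be packaged into an LLM prompt (the dataclass fields, prompt wording, and names below are hypothetical, not the paper's framework):

```python
# Sketch only: one way to represent a parameterized Big Five personality and turn
# it, together with a short interaction memory, into an LLM system prompt.
# Everything here is illustrative, not the authors' design.
from dataclasses import dataclass, field


@dataclass
class BigFivePersonality:
    openness: float = 0.5          # each trait in [0, 1]
    conscientiousness: float = 0.5
    extraversion: float = 0.5
    agreeableness: float = 0.5
    neuroticism: float = 0.5


@dataclass
class RobotCharacter:
    name: str
    personality: BigFivePersonality
    memory: list = field(default_factory=list)   # abstracted interaction history

    def system_prompt(self) -> str:
        p = self.personality
        traits = (f"openness={p.openness:.2f}, conscientiousness={p.conscientiousness:.2f}, "
                  f"extraversion={p.extraversion:.2f}, agreeableness={p.agreeableness:.2f}, "
                  f"neuroticism={p.neuroticism:.2f}")
        recent = " | ".join(self.memory[-3:]) or "none"
        return (f"You are {self.name}, a robot with Big Five traits ({traits}). "
                f"Appraise the user's behaviour, pick an emotion, then choose an action. "
                f"Recent memory: {recent}")


if __name__ == "__main__":
    robot = RobotCharacter("Nori", BigFivePersonality(extraversion=0.9, neuroticism=0.2))
    robot.memory.append("User greeted the robot warmly.")
    print(robot.system_prompt())
```
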
4. arXiv:2503.14938 [cs.CV]
Title: Optimal Transport Adapter Tuning for Bridging Modality Gaps in Few-Shot Remote Sensing Scene Classification
Authors: Zhong Ji, Ci Liu, Jingren Liu, Chen Tang, Yanwei Pang, Xuelong Li
Abstract: Few-Shot Remote Sensing Scene Classification (FS-RSSC) presents the challenge of classifying remote sensing images with limited labeled samples. Existing methods typically emphasize single-modal feature learning, neglecting the potential benefits of optimizing multi-modal representations. To address this limitation, we propose a novel Optimal Transport Adapter Tuning (OTAT) framework aimed at constructing an ideal Platonic representational space through optimal transport (OT) theory. This framework seeks to harmonize rich visual information with less dense textual cues, enabling effective cross-modal information transfer and complementarity. Central to this approach is the Optimal Transport Adapter (OTA), which employs a cross-modal attention mechanism to enrich textual representations and facilitate subsequent better information interaction. By transforming the network optimization into an OT optimization problem, OTA establishes efficient pathways for balanced information exchange between modalities. Moreover, we introduce a sample-level Entropy-Aware Weighted (EAW) loss, which combines difficulty-weighted similarity scores with entropy-based regularization. This loss function provides finer control over the OT optimization process, enhancing its solvability and stability. Our framework offers a scalable and efficient solution for advancing multimodal learning in remote sensing applications. Extensive experiments on benchmark datasets demonstrate that OTAT achieves state-of-the-art performance in FS-RSSC, significantly improving the model performance and generalization.
Submitted 19 March, 2025; originally announced March 2025.
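
For reference, the entropy-regularized optimal transport machinery that OTAT builds on can be computed with standard Sinkhorn iterations. The sketch below is the textbook formulation on toy token embeddings, not the OTA adapter or the EAW loss itself:

```python
# Minimal entropy-regularized optimal transport via Sinkhorn iterations, included
# only as a reference for the OT machinery the abstract relies on. numpy-only,
# standard formulation; the toy "visual"/"text" embeddings are made up.
import numpy as np


def sinkhorn(a, b, cost, eps=0.05, n_iter=200):
    """Return the entropy-regularized transport plan between histograms a and b."""
    K = np.exp(-cost / eps)                 # Gibbs kernel
    u = np.ones_like(a)
    for _ in range(n_iter):
        v = b / (K.T @ u)                   # scale columns to match marginal b
        u = a / (K @ v)                     # scale rows to match marginal a
    return u[:, None] * K * v[None, :]      # transport plan P = diag(u) K diag(v)


if __name__ == "__main__":
    rng = np.random.default_rng(0)
    x, y = rng.normal(size=(5, 2)), rng.normal(size=(7, 2))      # toy visual / text tokens
    cost = ((x[:, None, :] - y[None, :, :]) ** 2).sum(-1)        # squared Euclidean cost
    cost = cost / cost.max()                                     # normalize for numerical stability
    a, b = np.full(5, 1 / 5), np.full(7, 1 / 7)                  # uniform marginals
    P = sinkhorn(a, b, cost)
    print(P.shape, P.sum(axis=1), P.sum(axis=0))                 # marginals close to a and b
```
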
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.13956">arXiv:2503.13956</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.13956">pdf</a>, <a href="https://arxiv.org/format/2503.13956">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Improving LLM Video Understanding with 16 Frames Per Second </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Li%2C+Y">Yixuan Li</a>, <a href="/search/cs?searchtype=author&amp;query=Tang%2C+C">Changli Tang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhuang%2C+J">Jimin Zhuang</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+Y">Yudong Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+G">Guangzhi Sun</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+W">Wei Li</a>, <a href="/search/cs?searchtype=author&amp;query=Ma%2C+Z">Zejun Ma</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+C">Chao Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.13956v1-abstract-short" style="display: inline;"> Human vision is dynamic and continuous. However, in video understanding with multimodal large language models (LLMs), existing methods primarily rely on static features extracted from images sampled at a fixed low frame rate of frame-per-second (FPS) $\leqslant$2, leading to critical visual information loss. In this paper, we introduce F-16, the first multimodal LLM designed for high-frame-rate vi&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.13956v1-abstract-full').style.display = 'inline'; document.getElementById('2503.13956v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.13956v1-abstract-full" style="display: none;"> Human vision is dynamic and continuous. However, in video understanding with multimodal large language models (LLMs), existing methods primarily rely on static features extracted from images sampled at a fixed low frame rate of frame-per-second (FPS) $\leqslant$2, leading to critical visual information loss. In this paper, we introduce F-16, the first multimodal LLM designed for high-frame-rate video understanding. By increasing the frame rate to 16 FPS and compressing visual tokens within each 1-second clip, F-16 efficiently captures dynamic visual features while preserving key semantic information. Experimental results demonstrate that higher frame rates considerably enhance video understanding across multiple benchmarks, providing a new approach to improving video LLMs beyond scaling model size or training data. F-16 achieves state-of-the-art performance among 7-billion-parameter video LLMs on both general and fine-grained video understanding benchmarks, such as Video-MME and TemporalBench. Furthermore, F-16 excels in complex spatiotemporal tasks, including high-speed sports analysis (\textit{e.g.}, basketball, football, gymnastics, and diving), outperforming SOTA proprietary visual models like GPT-4o and Gemini-1.5-pro. 
Additionally, we introduce a novel decoding method for F-16 that enables highly efficient low-frame-rate inference without requiring model retraining. Upon acceptance, we will release the source code, model checkpoints, and data. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.13956v1-abstract-full').style.display = 'none'; document.getElementById('2503.13956v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.11619">arXiv:2503.11619</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.11619">pdf</a>, <a href="https://arxiv.org/format/2503.11619">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> </div> </div> <p class="title is-5 mathjax"> Tit-for-Tat: Safeguarding Large Vision-Language Models Against Jailbreak Attacks via Adversarial Defense </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Hao%2C+S">Shuyang Hao</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Y">Yiwei Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Hooi%2C+B">Bryan Hooi</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+M">Ming-Hsuan Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+J">Jun Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Tang%2C+C">Chengcheng Tang</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+Z">Zi Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Cai%2C+Y">Yujun Cai</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.11619v1-abstract-short" style="display: inline;"> Deploying large vision-language models (LVLMs) introduces a unique vulnerability: susceptibility to malicious attacks via visual inputs. However, existing defense methods suffer from two key limitations: (1) They solely focus on textual defenses, fail to directly address threats in the visual domain where attacks originate, and (2) the additional processing steps often incur significant computatio&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.11619v1-abstract-full').style.display = 'inline'; document.getElementById('2503.11619v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.11619v1-abstract-full" style="display: none;"> Deploying large vision-language models (LVLMs) introduces a unique vulnerability: susceptibility to malicious attacks via visual inputs. However, existing defense methods suffer from two key limitations: (1) They solely focus on textual defenses, fail to directly address threats in the visual domain where attacks originate, and (2) the additional processing steps often incur significant computational overhead or compromise model performance on benign tasks. 
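
A toy sketch of the sampling-and-compression recipe the abstract describes, i.e. selecting frames at 16 FPS and pooling the visual tokens of each 1-second clip to a fixed budget. The tensor shapes, the mean-pool compressor, and the token budget are illustrative assumptions, not F-16's actual modules:

```python
# Toy sketch of 16 FPS sampling plus per-second token compression: pick frame
# indices at 16 FPS, then pool the visual tokens of every 16-frame (1-second) clip
# down to a fixed budget. Shapes and the mean-pool "compressor" are placeholders.
import torch


def sample_indices(num_frames: int, native_fps: float, target_fps: float = 16.0):
    step = native_fps / target_fps
    return [min(int(round(i * step)), num_frames - 1)
            for i in range(int(num_frames / step))]


def compress_per_second(frame_tokens: torch.Tensor, tokens_per_clip: int = 64):
    """frame_tokens: (T, N, D) tokens for T sampled frames; pool every 16 frames."""
    T, N, D = frame_tokens.shape
    clips = frame_tokens[: (T // 16) * 16].reshape(-1, 16 * N, D)   # (clips, 16N, D)
    # Cheap stand-in for a learned compressor: average consecutive groups of tokens.
    group = clips.shape[1] // tokens_per_clip
    return clips[:, : group * tokens_per_clip, :].reshape(
        clips.shape[0], tokens_per_clip, group, D).mean(dim=2)      # (clips, 64, D)


if __name__ == "__main__":
    idx = sample_indices(num_frames=900, native_fps=30.0)           # 30 s at 30 FPS -> 480 sampled frames
    tokens = torch.randn(len(idx), 49, 256)                         # 7x7 patch tokens per frame
    print(len(idx), compress_per_second(tokens).shape)
```
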
6. arXiv:2503.11619 [cs.CR]
Title: Tit-for-Tat: Safeguarding Large Vision-Language Models Against Jailbreak Attacks via Adversarial Defense
Authors: Shuyang Hao, Yiwei Wang, Bryan Hooi, Ming-Hsuan Yang, Jun Liu, Chengcheng Tang, Zi Huang, Yujun Cai
Abstract: Deploying large vision-language models (LVLMs) introduces a unique vulnerability: susceptibility to malicious attacks via visual inputs. However, existing defense methods suffer from two key limitations: (1) they focus solely on textual defenses, failing to directly address threats in the visual domain where attacks originate, and (2) the additional processing steps often incur significant computational overhead or compromise model performance on benign tasks. Building on these insights, we propose ESIII (Embedding Security Instructions Into Images), a novel methodology for transforming the visual space from a source of vulnerability into an active defense mechanism. Initially, we embed security instructions into defensive images through gradient-based optimization, obtaining security instructions in the visual dimension. Subsequently, we integrate security instructions from visual and textual dimensions with the input query. The collaboration between security instructions from different dimensions ensures comprehensive security protection. Extensive experiments demonstrate that our approach effectively fortifies the robustness of LVLMs against such attacks while preserving their performance on standard benign tasks and incurring an imperceptible increase in time costs.
Submitted 14 March, 2025; originally announced March 2025.
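
The gradient-based step of embedding an instruction into a defensive image can be illustrated generically as optimizing pixel values so that a frozen encoder maps the image close to a target embedding. The tiny encoder, cosine objective, and hyperparameters below are placeholders for illustration, not ESIII's actual models:

```python
# Generic sketch of gradient-based "instruction embedding into an image": optimize
# an image so a frozen encoder produces an embedding near a target text embedding.
# The small conv encoder and cosine loss are stand-ins, not the ESIII pipeline.
import torch
import torch.nn.functional as F

torch.manual_seed(0)
encoder = torch.nn.Sequential(                       # frozen placeholder image encoder
    torch.nn.Conv2d(3, 8, 3, stride=2, padding=1), torch.nn.ReLU(),
    torch.nn.AdaptiveAvgPool2d(1), torch.nn.Flatten(), torch.nn.Linear(8, 32),
).eval()
for p in encoder.parameters():
    p.requires_grad_(False)

target = F.normalize(torch.randn(1, 32), dim=-1)     # stand-in "security instruction" embedding
image = torch.rand(1, 3, 64, 64, requires_grad=True) # defensive image being optimized
opt = torch.optim.Adam([image], lr=0.05)

for step in range(200):
    emb = F.normalize(encoder(image.clamp(0, 1)), dim=-1)
    loss = 1.0 - (emb * target).sum()                # cosine distance to the target
    opt.zero_grad()
    loss.backward()
    opt.step()

print(f"final cosine distance: {loss.item():.3f}")
```
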
7. arXiv:2503.09559 [eess.IV, cs.CV, cs.LG, eess.SP]
Title: The R2D2 Deep Neural Network Series for Scalable Non-Cartesian Magnetic Resonance Imaging
Authors: Yiwei Chen, Amir Aghabiglou, Shijie Chen, Motahare Torki, Chao Tang, Ruud B. van Heeswijk, Yves Wiaux
Abstract: We introduce the R2D2 Deep Neural Network (DNN) series paradigm for fast and scalable image reconstruction from highly-accelerated non-Cartesian k-space acquisitions in Magnetic Resonance Imaging (MRI). While unrolled DNN architectures provide a robust image formation approach via data-consistency layers, embedding non-uniform fast Fourier transform operators in a DNN can become impractical to train at large scale, e.g. in 2D MRI with a large number of coils, or for higher-dimensional imaging. Plug-and-play approaches that alternate a learned denoiser blind to the measurement setting with a data-consistency step are not affected by this limitation but their highly iterative nature implies slow reconstruction. To address this scalability challenge, we leverage the R2D2 paradigm that was recently introduced to enable ultra-fast reconstruction for large-scale Fourier imaging in radio astronomy. R2D2's reconstruction is formed as a series of residual images iteratively estimated as outputs of DNN modules taking the previous iteration's data residual as input. The method can be interpreted as a learned version of the Matching Pursuit algorithm. A series of R2D2 DNN modules were sequentially trained in a supervised manner on the fastMRI dataset and validated for 2D multi-coil MRI in simulation and on real data, targeting highly under-sampled radial k-space sampling. Results suggest that a series with only a few DNNs achieves superior reconstruction quality over its unrolled incarnation R2D2-Net (whose training is also much less scalable), and over the state-of-the-art diffusion-based "Decomposed Diffusion Sampler" approach (also characterised by a slower reconstruction process).
Submitted 13 March, 2025; v1 submitted 12 March, 2025; originally announced March 2025.
Comments: 13 pages, 10 figures
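
The series structure described above (each module consumes the previous iterate's data residual and predicts a residual image to add) can be sketched schematically as an outer loop. The random linear operator and the gradient-step stand-in for a trained DNN module are assumptions for illustration only:

```python
# Schematic of an R2D2-style outer loop as described in the abstract: each module
# maps (current image, back-projected data residual) to a residual image that is
# added to the running reconstruction. Operator and "DNN" below are placeholders.
import numpy as np

rng = np.random.default_rng(0)
n_pix, n_meas = 64, 128
A = rng.normal(size=(n_meas, n_pix)) / np.sqrt(n_meas)   # stand-in measurement operator
x_true = rng.normal(size=n_pix)
y = A @ x_true + 0.01 * rng.normal(size=n_meas)          # noisy measurements


def dnn_module(x, back_projected_residual, step=0.5):
    # Placeholder for a trained network N_i(x, A^T r); here just a gradient step.
    return step * back_projected_residual


x = np.zeros(n_pix)
for i in range(10):                                       # series of modules
    residual = y - A @ x                                  # data residual of iterate i
    x = x + dnn_module(x, A.T @ residual)                 # add predicted residual image
    print(f"module {i}: data-residual norm = {np.linalg.norm(residual):.3f}")
```
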
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">13 pages, 10 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.08165">arXiv:2503.08165</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.08165">pdf</a>, <a href="https://arxiv.org/format/2503.08165">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Multimodal Generation of Animatable 3D Human Models with AvatarForge </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Liu%2C+X">Xinhang Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Tai%2C+Y">Yu-Wing Tai</a>, <a href="/search/cs?searchtype=author&amp;query=Tang%2C+C">Chi-Keung Tang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.08165v1-abstract-short" style="display: inline;"> We introduce AvatarForge, a framework for generating animatable 3D human avatars from text or image inputs using AI-driven procedural generation. While diffusion-based methods have made strides in general 3D object generation, they struggle with high-quality, customizable human avatars due to the complexity and diversity of human body shapes, poses, exacerbated by the scarcity of high-quality data&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.08165v1-abstract-full').style.display = 'inline'; document.getElementById('2503.08165v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.08165v1-abstract-full" style="display: none;"> We introduce AvatarForge, a framework for generating animatable 3D human avatars from text or image inputs using AI-driven procedural generation. While diffusion-based methods have made strides in general 3D object generation, they struggle with high-quality, customizable human avatars due to the complexity and diversity of human body shapes, poses, exacerbated by the scarcity of high-quality data. Additionally, animating these avatars remains a significant challenge for existing methods. AvatarForge overcomes these limitations by combining LLM-based commonsense reasoning with off-the-shelf 3D human generators, enabling fine-grained control over body and facial details. Unlike diffusion models which often rely on pre-trained datasets lacking precise control over individual human features, AvatarForge offers a more flexible approach, bringing humans into the iterative design and modeling loop, with its auto-verification system allowing for continuous refinement of the generated avatars, and thus promoting high accuracy and customization. Our evaluations show that AvatarForge outperforms state-of-the-art methods in both text- and image-to-avatar generation, making it a versatile tool for artistic creation and animation. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.08165v1-abstract-full').style.display = 'none'; document.getElementById('2503.08165v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.07503">arXiv:2503.07503</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.07503">pdf</a>, <a href="https://arxiv.org/format/2503.07503">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Think Before You Segment: High-Quality Reasoning Segmentation with GPT Chain of Thoughts </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Kao%2C+S">Shiu-hong Kao</a>, <a href="/search/cs?searchtype=author&amp;query=Tai%2C+Y">Yu-Wing Tai</a>, <a href="/search/cs?searchtype=author&amp;query=Tang%2C+C">Chi-Keung Tang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.07503v3-abstract-short" style="display: inline;"> Reasoning segmentation is a challenging vision-language task that aims to output the segmentation mask with respect to a complex, implicit, and even non-visual query text. Previous works incorporated multimodal Large Language Models (MLLMs) with segmentation models to approach the difficult problem. However, their segmentation quality often falls short in complex cases, particularly when dealing w&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.07503v3-abstract-full').style.display = 'inline'; document.getElementById('2503.07503v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.07503v3-abstract-full" style="display: none;"> Reasoning segmentation is a challenging vision-language task that aims to output the segmentation mask with respect to a complex, implicit, and even non-visual query text. Previous works incorporated multimodal Large Language Models (MLLMs) with segmentation models to approach the difficult problem. However, their segmentation quality often falls short in complex cases, particularly when dealing with out-of-domain objects with intricate structures, blurry boundaries, occlusions, or high similarity with surroundings. In this paper, we introduce ThinkFirst, a training-free reasoning segmentation framework that leverages GPT&#39;s chain of thought to address these challenging cases. Our approach allows GPT-4o or other powerful MLLMs to generate a detailed, chain-of-thought description of an image. This summarized description is then passed to a language-instructed segmentation assistant to aid the segmentation process. Our framework allows users to easily interact with the segmentation agent using multimodal inputs, such as easy text and image scribbles, for successive refinement or communication. We evaluate the performance of ThinkFirst on diverse objects. 
10. arXiv:2503.07323 [cs.RO, cs.AI, cs.CV]
Title: Dynamic Path Navigation for Motion Agents with LLM Reasoning
Authors: Yubo Zhao, Qi Wu, Yifan Wang, Yu-Wing Tai, Chi-Keung Tang
Abstract: Large Language Models (LLMs) have demonstrated strong generalizable reasoning and planning capabilities. However, their efficacies in spatial path planning and obstacle-free trajectory generation remain underexplored. Leveraging LLMs for navigation holds significant potential, given LLMs' ability to handle unseen scenarios, support user-agent interactions, and provide global control across complex systems, making them well-suited for agentic planning and humanoid motion generation. As one of the first studies in this domain, we explore the zero-shot navigation and path generation capabilities of LLMs by constructing a dataset and proposing an evaluation protocol. Specifically, we represent paths using anchor points connected by straight lines, enabling movement in various directions. This approach offers greater flexibility and practicality compared to previous methods while remaining simple and intuitive for LLMs. We demonstrate that, when tasks are well-structured in this manner, modern LLMs exhibit substantial planning proficiency in avoiding obstacles while autonomously refining navigation with the generated motion to reach the target. Further, this spatial reasoning ability of a single LLM motion agent interacting in a static environment can be seamlessly generalized to the coordination of multiple motion agents in dynamic environments. Unlike traditional approaches that rely on single-step planning or local policies, our training-free LLM-based method enables global, dynamic, closed-loop planning and autonomously resolves collision issues.
Submitted 10 March, 2025; originally announced March 2025.
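
The path representation used here, anchor points joined by straight segments, lends itself to a very small geometric validity check. The circular-obstacle model and function names below are illustrative assumptions, not the paper's evaluation protocol:

```python
# Small geometry sketch of the path representation the abstract describes: a path
# is a list of anchor points joined by straight segments, and a plan is valid if
# no segment intersects a (circular) obstacle. Obstacle model is illustrative.
import math


def segment_hits_circle(p, q, center, radius):
    """True if segment p-q passes within `radius` of `center`."""
    px, py = q[0] - p[0], q[1] - p[1]
    seg_len2 = px * px + py * py
    if seg_len2 == 0:
        t = 0.0
    else:
        t = max(0.0, min(1.0, ((center[0] - p[0]) * px + (center[1] - p[1]) * py) / seg_len2))
    cx, cy = p[0] + t * px, p[1] + t * py            # closest point on the segment
    return math.hypot(center[0] - cx, center[1] - cy) <= radius


def path_is_collision_free(anchors, obstacles):
    return not any(segment_hits_circle(a, b, c, r)
                   for a, b in zip(anchors, anchors[1:])
                   for (c, r) in obstacles)


if __name__ == "__main__":
    anchors = [(0, 0), (2, 3), (5, 3), (6, 0)]        # LLM-proposed anchor points
    obstacles = [((3.5, 1.0), 0.8), ((1.0, 4.0), 0.5)]
    print(path_is_collision_free(anchors, obstacles))
```
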
11. arXiv:2503.07217 [cs.SD, cs.CV]
Title: ReelWave: A Multi-Agent Framework Toward Professional Movie Sound Generation
Authors: Zixuan Wang, Chi-Keung Tang, Yu-Wing Tai
Abstract: Film production is an important application for generative audio, where richer context is provided through multiple scenes. In ReelWave, we propose a multi-agent framework for audio generation inspired by the professional movie production process. We first capture semantically and temporally synchronized "on-screen" sound by training a prediction model that predicts three interpretable time-varying audio control signals comprising loudness, pitch, and timbre. These three parameters are subsequently specified as conditions by a cross-attention module. Then, our framework infers "off-screen" sound to complement the generation through cooperative interaction between communicative agents. Each agent takes up specific roles similar to the movie production team and is supervised by an agent called the director. Besides, we investigate the case in which the conditioning video consists of multiple scenes, a case frequently seen in videos extracted from movies of considerable length. Consequently, our framework can capture a richer context of audio generation conditioned on video clips extracted from movies.
Submitted 10 March, 2025; originally announced March 2025.
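
As a rough signal-processing illustration of the three control signals named in the abstract (loudness, pitch, and timbre), common librosa features can serve as proxies. This is generic feature extraction for orientation only, not ReelWave's learned prediction model:

```python
# Sketch of extracting rough, interpretable proxies for the three control signals
# the abstract names (loudness, pitch, timbre) from a waveform using standard
# librosa features: RMS energy, YIN f0, and spectral centroid.
import numpy as np
import librosa

sr = 22050
t = np.linspace(0, 2.0, int(2.0 * sr), endpoint=False)
y = 0.3 * np.sin(2 * np.pi * 220 * t) * np.exp(-t)        # toy decaying 220 Hz tone

loudness = librosa.feature.rms(y=y)[0]                     # frame-wise RMS energy
pitch = librosa.yin(y, fmin=librosa.note_to_hz("C2"),
                    fmax=librosa.note_to_hz("C7"), sr=sr)  # frame-wise f0 estimate
timbre = librosa.feature.spectral_centroid(y=y, sr=sr)[0]  # crude brightness proxy

print(loudness.shape, pitch.shape, timbre.shape)
print(f"median f0 = {np.median(pitch):.1f} Hz")            # should be near 220 Hz here
```
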
However, existing task planning methods for instruction following are either data-driven or underperform in zero-shot scenarios due to difficulties in grounding lengthy instructions into actionable plans under operational constraints. To address this, we&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.02698v1-abstract-full').style.display = 'inline'; document.getElementById('2503.02698v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.02698v1-abstract-full" style="display: none;"> Robotic instruction following tasks require seamless integration of visual perception, task planning, target localization, and motion execution. However, existing task planning methods for instruction following are either data-driven or underperform in zero-shot scenarios due to difficulties in grounding lengthy instructions into actionable plans under operational constraints. To address this, we propose FlowPlan, a structured multi-stage LLM workflow that elevates zero-shot pipeline and bridges the performance gap between zero-shot and data-driven in-context learning methods. By decomposing the planning process into modular stages--task information retrieval, language-level reasoning, symbolic-level planning, and logical evaluation--FlowPlan generates logically coherent action sequences while adhering to operational constraints and further extracts contextual guidance for precise instance-level target localization. Benchmarked on the ALFRED and validated in real-world applications, our method achieves competitive performance relative to data-driven in-context learning methods and demonstrates adaptability across diverse environments. This work advances zero-shot task planning in robotic systems without reliance on labeled data. Project website: https://instruction-following-project.github.io/. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.02698v1-abstract-full').style.display = 'none'; document.getElementById('2503.02698v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. 
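The FlowPlan abstract above decomposes zero-shot planning into modular LLM stages. A minimal sketch of such a staged workflow is shown below; the `call_llm` helper and the prompts are placeholders for illustration, not the prompts or interfaces used in the paper.

    # Minimal sketch of a staged LLM planning workflow, following the stage
    # names given in the abstract. `call_llm` stands in for any chat API.

    def call_llm(prompt: str) -> str:
        raise NotImplementedError("plug in an LLM client here")

    def staged_plan(instruction: str, scene: str) -> str:
        # 1) Task information retrieval: extract objects, goal, constraints.
        info = call_llm(
            f"List target objects, goal state and constraints.\nInstruction: {instruction}\nScene: {scene}"
        )
        # 2) Language-level reasoning: outline the steps in natural language.
        outline = call_llm(f"Given {info}, outline the steps needed to complete the task.")
        # 3) Symbolic-level planning: map the outline to primitive actions.
        plan = call_llm(f"Rewrite as primitive actions (GoTo, Pick, Place, Open, Close):\n{outline}")
        # 4) Logical evaluation: check ordering/preconditions and repair.
        return call_llm(f"Check this plan for precondition violations and return a corrected plan:\n{plan}")

Each stage sees only the output it needs from the previous one, which is what keeps a long instruction from overwhelming a single prompt.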
arXiv:2503.02554 [pdf, other] https://arxiv.org/abs/2503.02554
Categories: astro-ph.IM (Instrumentation and Methods for Astrophysics); cs.CV (Computer Vision and Pattern Recognition); cs.LG (Machine Learning); eess.IV (Image and Video Processing); eess.SP (Signal Processing)
Title: Towards a robust R2D2 paradigm for radio-interferometric imaging: revisiting DNN training and architecture
Authors: Amir Aghabiglou, Chung San Chu, Chao Tang, Arwa Dabbech, Yves Wiaux
Abstract: The R2D2 Deep Neural Network (DNN) series was recently introduced for image formation in radio interferometry. It can be understood as a learned version of CLEAN, whose minor cycles are substituted with DNNs. We revisit R2D2 on the grounds of series convergence, training methodology, and DNN architecture, improving its robustness in terms of generalisability beyond training conditions, capability to deliver high data fidelity, and epistemic uncertainty. Firstly, while still focusing on telescope-specific training, we enhance the learning process by randomising Fourier sampling integration times, incorporating multi-scan multi-noise configurations, and varying imaging settings, including pixel resolution and visibility-weighting scheme. Secondly, we introduce a convergence criterion whereby the reconstruction process stops when the data residual is compatible with noise, rather than simply using all available DNNs. This not only increases reconstruction efficiency by reducing its computational cost, but also refines training by pruning out the data/image pairs for which optimal data fidelity is reached before training the next DNN. Thirdly, we substitute R2D2's early U-Net DNN with a novel architecture (U-WDSR) combining U-Net and WDSR, which leverages wide activation, dense connections, weight normalisation, and low-rank convolution to improve feature reuse and reconstruction precision. As before, R2D2 is trained for monochromatic intensity imaging with the Very Large Array (VLA) at a fixed $512 \times 512$ image size. Simulations on a wide range of inverse problems and a case study on real data reveal that the new R2D2 model consistently outperforms its earlier version in image reconstruction quality, data fidelity, and epistemic uncertainty.
Submitted 4 March, 2025; originally announced March 2025.
Comments: 17 pages, 6 figures
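The second contribution above is a convergence criterion: the iterative reconstruction stops as soon as the data residual is compatible with the noise level, instead of always applying every DNN in the series. A schematic of such a stopping rule is sketched below; the operators and network interfaces are assumptions for illustration, not the released R2D2 code.

    import numpy as np

    def series_reconstruction(dnns, forward, adjoint, y, x0, noise_level, tol=1.05):
        """Apply a learned series of networks, stopping on a residual criterion.

        dnns        : list of callables (current image, back-projected residual) -> image update
        forward     : measurement operator (image -> data); adjoint: its adjoint
        y           : observed data; x0: initial image; noise_level: expected noise norm
        """
        x = x0.copy()
        for net in dnns:
            residual = y - forward(x)
            # Stop once the data residual is compatible with noise.
            if np.linalg.norm(residual) <= tol * noise_level:
                break
            x = x + net(x, adjoint(residual))
        return x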
arXiv:2503.00508 [pdf, other] https://arxiv.org/abs/2503.00508
Categories: cs.RO (Robotics)
Title: HGDiffuser: Efficient Task-Oriented Grasp Generation via Human-Guided Grasp Diffusion Models
Authors: Dehao Huang, Wenlong Dong, Chao Tang, Hong Zhang
Abstract: Task-oriented grasping (TOG) is essential for robots to perform manipulation tasks, requiring grasps that are both stable and compliant with task-specific constraints. Humans naturally grasp objects in a task-oriented manner to facilitate subsequent manipulation tasks. By leveraging human grasp demonstrations, current methods can generate high-quality robotic parallel-jaw task-oriented grasps for diverse objects and tasks. However, they still encounter challenges in maintaining grasp stability and sampling efficiency. These methods typically rely on a two-stage process: first performing exhaustive task-agnostic grasp sampling in the 6-DoF space, then applying demonstration-induced constraints (e.g., contact regions and wrist orientations) to filter candidates. This leads to inefficiency and potential failure due to the vast sampling space. To address this, we propose the Human-guided Grasp Diffuser (HGDiffuser), a diffusion-based framework that integrates these constraints into a guided sampling process. Through this approach, HGDiffuser directly generates 6-DoF task-oriented grasps in a single stage, eliminating exhaustive task-agnostic sampling. Furthermore, by incorporating Diffusion Transformer (DiT) blocks as the feature backbone, HGDiffuser improves grasp generation quality compared to MLP-based methods. Experimental results demonstrate that our approach significantly improves the efficiency of task-oriented grasp generation, enabling more effective transfer of human grasping strategies to robotic systems. To access the source code and supplementary videos, visit https://sites.google.com/view/hgdiffuser
Submitted 1 March, 2025; originally announced March 2025.
Comments: 8 pages, 6 figures
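HGDiffuser folds demonstration-induced constraints into the sampling process itself rather than filtering candidates afterwards. The snippet below shows the general shape of a guidance-biased reverse-diffusion step; it is a generic classifier-guidance-style update written for illustration, not the HGDiffuser architecture, and the constraint cost is a stand-in for the demonstration-derived terms described in the abstract.

    import numpy as np

    def guided_reverse_step(x, t, eps_model, constraint_grad, alpha, alpha_bar, sigma, scale=1.0):
        """One DDPM-style reverse step biased toward low constraint cost (illustrative).

        eps_model(x, t)    -> predicted noise for the current sample
        constraint_grad(x) -> gradient of a differentiable constraint cost
                              (e.g. distance to demonstrated contact regions)
        """
        eps = eps_model(x, t)
        mean = (x - (1 - alpha) / np.sqrt(1 - alpha_bar) * eps) / np.sqrt(alpha)
        mean = mean - scale * sigma ** 2 * constraint_grad(x)  # guidance term
        return mean + sigma * np.random.randn(*x.shape)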
arXiv:2502.15801 [pdf, other] https://arxiv.org/abs/2502.15801
Categories: cs.LG (Machine Learning); cs.AI (Artificial Intelligence); cs.CL (Computation and Language)
Title: An explainable transformer circuit for compositional generalization
Authors: Cheng Tang, Brenden Lake, Mehrdad Jazayeri
Abstract: Compositional generalization, the systematic combination of known components into novel structures, remains a core challenge in cognitive science and machine learning. Although transformer-based large language models can exhibit strong performance on certain compositional tasks, the underlying mechanisms driving these abilities remain opaque, calling into question their interpretability. In this work, we identify and mechanistically interpret the circuit responsible for compositional induction in a compact transformer. Using causal ablations, we validate the circuit and formalize its operation using a program-like description. We further demonstrate that this mechanistic understanding enables precise activation edits to steer the model's behavior predictably. Our findings advance the understanding of complex behaviors in transformers and highlight that such insights can provide a direct pathway for model control.
Submitted 18 February, 2025; originally announced February 2025.
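Causal ablation, as used above to validate the circuit, amounts to knocking out one component and measuring how much the model's output changes. A toy illustration with a two-head attention layer is given below; it is not the paper's model, only a demonstration of the ablation idea.

    import numpy as np

    def softmax(z, axis=-1):
        z = z - z.max(axis=axis, keepdims=True)
        e = np.exp(z)
        return e / e.sum(axis=axis, keepdims=True)

    def attention(x, Wq, Wk, Wv, ablate_head=None):
        """Toy multi-head self-attention; optionally zero out one head's output."""
        outs = []
        for h, (wq, wk, wv) in enumerate(zip(Wq, Wk, Wv)):
            q, k, v = x @ wq, x @ wk, x @ wv
            out = softmax(q @ k.T / np.sqrt(k.shape[-1])) @ v
            if h == ablate_head:
                out = np.zeros_like(out)  # causal ablation of head h
            outs.append(out)
        return np.concatenate(outs, axis=-1)

    rng = np.random.default_rng(0)
    x = rng.normal(size=(5, 8))                       # 5 tokens, width 8
    Wq = [rng.normal(size=(8, 4)) for _ in range(2)]  # 2 heads
    Wk = [rng.normal(size=(8, 4)) for _ in range(2)]
    Wv = [rng.normal(size=(8, 4)) for _ in range(2)]
    clean = attention(x, Wq, Wk, Wv)
    ablated = attention(x, Wq, Wk, Wv, ablate_head=0)
    print(np.abs(clean - ablated).max())  # effect attributable to head 0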
arXiv:2502.15601 [pdf, other] https://arxiv.org/abs/2502.15601
Categories: cs.CV (Computer Vision and Pattern Recognition); cs.AI (Artificial Intelligence); cs.GR (Graphics)
Title: WorldCraft: Photo-Realistic 3D World Creation and Customization via LLM Agents
Authors: Xinhang Liu, Chi-Keung Tang, Yu-Wing Tai
Abstract: Constructing photorealistic virtual worlds has applications across various fields, but it often requires the extensive labor of highly trained professionals to operate conventional 3D modeling software. To democratize this process, we introduce WorldCraft, a system where large language model (LLM) agents leverage procedural generation to create indoor and outdoor scenes populated with objects, allowing users to control individual object attributes and the scene layout using intuitive natural language commands. In our framework, a coordinator agent manages the overall process and works with two specialized LLM agents to complete the scene creation: ForgeIt, which integrates an ever-growing manual through auto-verification to enable precise customization of individual objects, and ArrangeIt, which formulates hierarchical optimization problems to achieve a layout that balances ergonomic and aesthetic considerations. Additionally, our pipeline incorporates a trajectory control agent, allowing users to animate the scene and operate the camera through natural language interactions. Our system is also compatible with off-the-shelf deep 3D generators to enrich scene assets. Through evaluations and comparisons with state-of-the-art methods, we demonstrate the versatility of WorldCraft, ranging from single-object customization to intricate, large-scale interior and exterior scene designs. This system empowers non-professionals to bring their creative visions to life.
Submitted 28 February, 2025; v1 submitted 21 February, 2025; originally announced February 2025.

arXiv:2502.14776 [pdf, other] https://arxiv.org/abs/2502.14776
Categories: cs.CL (Computation and Language)
Title: SurveyX: Academic Survey Automation via Large Language Models
Authors: Xun Liang, Jiawei Yang, Yezhaohui Wang, Chen Tang, Zifan Zheng, Shichao Song, Zehao Lin, Yebin Yang, Simin Niu, Hanyu Wang, Bo Tang, Feiyu Xiong, Keming Mao, Zhiyu Li
Abstract: Large Language Models (LLMs) have demonstrated exceptional comprehension capabilities and a vast knowledge base, suggesting that LLMs can serve as efficient tools for automated survey generation. However, recent research related to automated survey generation remains constrained by critical limitations such as a finite context window, a lack of in-depth content discussion, and the absence of systematic evaluation frameworks. Inspired by human writing processes, we propose SurveyX, an efficient and organized system for automated survey generation that decomposes the survey composing process into two phases: the Preparation and Generation phases. By innovatively introducing online reference retrieval, a pre-processing method called AttributeTree, and a re-polishing process, SurveyX significantly enhances the efficacy of survey composition. Experimental evaluation results show that SurveyX outperforms existing automated survey generation systems in content quality (0.259 improvement) and citation quality (1.76 enhancement), approaching human expert performance across multiple evaluation dimensions. Examples of surveys generated by SurveyX are available at www.surveyx.cn
Submitted 27 February, 2025; v1 submitted 20 February, 2025; originally announced February 2025.
Comments: 15 pages, 16 figures
arXiv:2502.11775 [pdf, other] https://arxiv.org/abs/2502.11775
Categories: cs.CV (Computer Vision and Pattern Recognition)
Title: video-SALMONN-o1: Reasoning-enhanced Audio-visual Large Language Model
Authors: Guangzhi Sun, Yudong Yang, Jimin Zhuang, Changli Tang, Yixuan Li, Wei Li, Zejun Ma, Chao Zhang
Abstract: While recent advancements in reasoning optimization have significantly enhanced the capabilities of large language models (LLMs), existing efforts to improve reasoning have been limited to solving mathematical problems and focusing on visual graphical inputs, neglecting broader applications in general video understanding. This paper proposes video-SALMONN-o1, the first open-source reasoning-enhanced audio-visual LLM designed for general video understanding tasks. To enhance its reasoning abilities, we develop a reasoning-intensive dataset featuring challenging audio-visual questions with step-by-step solutions. We also propose process direct preference optimization (pDPO), which leverages contrastive step selection to achieve efficient step-level reward modelling tailored for multimodal inputs. Additionally, we introduce RivaBench, the first reasoning-intensive video understanding benchmark, featuring over 4,000 high-quality, expert-curated question-answer pairs across scenarios such as standup comedy, academic presentations, and synthetic video detection. video-SALMONN-o1 achieves 3-8% accuracy improvements over the LLaVA-OneVision baseline across different video reasoning benchmarks, and pDPO achieves 6-8% improvements compared to the supervised fine-tuning model on RivaBench. Enhanced reasoning also gives video-SALMONN-o1 zero-shot synthetic video detection capabilities.
Submitted 17 February, 2025; originally announced February 2025.

arXiv:2502.11744 [pdf, other] https://arxiv.org/abs/2502.11744
Categories: cs.RO (Robotics); cs.CV (Computer Vision and Pattern Recognition)
Title: FUNCTO: Function-Centric One-Shot Imitation Learning for Tool Manipulation
Authors: Chao Tang, Anxing Xiao, Yuhong Deng, Tianrun Hu, Wenlong Dong, Hanbo Zhang, David Hsu, Hong Zhang
Abstract: Learning tool use from a single human demonstration video offers a highly intuitive and efficient approach to robot teaching. While humans can effortlessly generalize a demonstrated tool manipulation skill to diverse tools that support the same function (e.g., pouring with a mug versus a teapot), current one-shot imitation learning (OSIL) methods struggle to achieve this. A key challenge lies in establishing functional correspondences between demonstration and test tools, considering significant geometric variations among tools with the same function (i.e., intra-function variations). To address this challenge, we propose FUNCTO (Function-Centric OSIL for Tool Manipulation), an OSIL method that establishes function-centric correspondences with a 3D functional keypoint representation, enabling robots to generalize tool manipulation skills from a single human demonstration video to novel tools with the same function despite significant intra-function variations. With this formulation, we factorize FUNCTO into three stages: (1) functional keypoint extraction, (2) function-centric correspondence establishment, and (3) functional keypoint-based action planning. We evaluate FUNCTO against existing modular OSIL methods and end-to-end behavioral cloning methods through real-robot experiments on diverse tool manipulation tasks. The results demonstrate the superiority of FUNCTO when generalizing to novel tools with intra-function geometric variations. More details are available at https://sites.google.com/view/functo
Submitted 21 February, 2025; v1 submitted 17 February, 2025; originally announced February 2025.

arXiv:2502.10765 [pdf, other] https://arxiv.org/abs/2502.10765
Categories: cs.GT (Computer Science and Game Theory)
Title: Resource Allocation and Pricing for Blockchain-enabled Metaverse: A Stackelberg Game Approach
Authors: Zhanpeng Zhu, Feilong Lin, Changbing Tang, Zhongyu Chen
Abstract: As the next-generation Internet paradigm, the metaverse can provide users with immersive physical-virtual experiences without spatial limitations. However, various concerns remain to be addressed, such as resource allocation, resource pricing, and transaction security issues. To address these challenges, we integrate blockchain technology into the metaverse to manage and automate complex interactions effectively and securely, utilizing the advantages of blockchain. With the objective of promoting the Quality of Experience (QoE), Metaverse Service Users (MSUs) purchase rendering and bandwidth resources from the Metaverse Service Provider (MSP) to access low-latency and high-quality immersive services, while the MSP maximizes its profit by controlling the unit prices of resources. In this paper, we model the interaction between the MSP and MSUs as a Stackelberg game, in which the MSP acts as the leader and the MSUs are followers. The existence of a Stackelberg equilibrium (SE) is analyzed and proved mathematically. Besides, we propose an efficient greedy-and-search-based resource allocation and pricing algorithm (GSRAP) to solve for the SE point. Finally, we conduct extensive simulations to verify the effectiveness and efficiency of our designs. The experimental results show that our algorithm outperforms the baseline scheme in terms of improving the MSP's profit and convergence speed.
Submitted 15 February, 2025; originally announced February 2025.
Comments: 8 pages
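In the Stackelberg formulation above, the MSP (leader) posts unit prices and each MSU (follower) best-responds with its resource demand. The toy below illustrates that leader-follower structure with a simple concave utility; the utility form, parameters, and grid search are assumptions for illustration only and are unrelated to the GSRAP algorithm or the utilities used in the paper.

    import numpy as np

    # Follower i maximizes u_i(d) = a_i*d - 0.5*b_i*d^2 - price*d, giving the
    # best response d_i(price) = max(0, (a_i - price) / b_i).
    a = np.array([8.0, 6.0, 10.0])   # QoE sensitivity of three users
    b = np.array([1.0, 0.8, 1.5])    # diminishing-returns coefficients
    cost = 1.0                       # provider's unit cost

    def best_response(price):
        return np.clip((a - price) / b, 0.0, None)

    # Leader: pick the price that maximizes profit given followers' responses.
    price_grid = np.linspace(cost, a.max(), 200)
    profits = [(p - cost) * best_response(p).sum() for p in price_grid]
    k = int(np.argmax(profits))
    print(f"leader price {price_grid[k]:.2f}, profit {profits[k]:.2f}")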
arXiv:2502.10634 [pdf, other] https://arxiv.org/abs/2502.10634
Categories: cs.CL (Computation and Language)
Title: Lost in the Passage: Passage-level In-context Learning Does Not Necessarily Need a "Passage"
Authors: Hao Sun, Chenming Tang, Gengyang Li, Yunfang Wu
Abstract: By simply incorporating demonstrations into the context, in-context learning (ICL) enables large language models (LLMs) to achieve impressive performance on many tasks. In this paper, we focus on passage-level long-context ICL for generation tasks and find that LLMs cannot learn the intrinsic relationships between the demonstration passage and the generation output. We conduct experiments with different LLMs on two typical generation tasks, single-document QA and distractor generation, demonstrating that even a completely meaningless demonstration passage with 1/4 of the original length achieves much better performance than the original full passage. Analysis of attention scores reveals that LLMs pay little attention to passages compared to other components of the prompt, and that little attention flows from the passage to other parts of the demonstration, which further confirms our finding. Additionally, experiments on context compression indicate that compression approaches proven effective on other long-context tasks are not suitable for passage-level ICL, since simply using shorter meaningless demonstration passages already achieves competitive performance.
Submitted 14 February, 2025; originally announced February 2025.

arXiv:2502.05213 [pdf, other] https://arxiv.org/abs/2502.05213
Categories: cs.CR (Cryptography and Security); cs.AI (Artificial Intelligence)
Title: DERMARK: A Dynamic, Efficient and Robust Multi-bit Watermark for Large Language Models
Authors: Qihao Lin, Chen Tang, Lan Zhang, Junyang Zhang, Xiangyang Li
Abstract: Well-trained large language models (LLMs) present significant risks, including potential malicious use and copyright infringement. Current studies aim to trace the distribution of LLM-generated texts by implicitly embedding watermarks. Among these, the single-bit watermarking method can only determine whether a given text was generated by an LLM. In contrast, the multi-bit watermarking method embeds richer information into the generated text, which can identify which LLM generated and distributed a given text to which user. However, existing efforts embed the multi-bit watermark directly into the generated text without accounting for its watermarking capacity, which can result in embedding failures when the text's watermarking capacity is insufficient. In this paper, we derive the watermark embedding distribution based on the logits of LLMs and propose a formal inequality to segment the text optimally for watermark embedding. Building on this foundation, we propose DERMARK, a dynamic, efficient, and robust multi-bit watermarking method. DERMARK divides the text into segments of varying lengths for each bit embedding, adaptively matching the text's capacity. It achieves this with negligible overhead and robust performance against text editing by minimizing watermark extraction loss. Comprehensive experiments demonstrate that, compared to the SOTA method, our method reduces the number of tokens required for embedding each bit by 20%, reduces watermark embedding time by 50%, and is robust to text editing and watermark erasure attacks.
Submitted 4 February, 2025; originally announced February 2025.
Comments: 8 pages, 15 figures
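The key idea above is to split the generated text into variable-length segments whose watermarking capacity suffices for one bit each. The sketch below shows that segmentation step in schematic form; the per-token capacity scores are placeholders for the logit-derived quantities in the paper, and the greedy rule is only an illustration of adaptive segment lengths, not DERMARK's segmentation inequality.

    def segment_by_capacity(capacities, threshold):
        """Extend the current segment until its accumulated capacity reaches
        `threshold`, then start a new segment; one segment per embedded bit."""
        segments, start, acc = [], 0, 0.0
        for i, c in enumerate(capacities):
            acc += c
            if acc >= threshold:
                segments.append((start, i + 1))
                start, acc = i + 1, 0.0
        return segments

    # Low-capacity tokens get folded into longer segments.
    print(segment_by_capacity([0.2, 0.1, 0.9, 0.4, 0.7, 0.05, 0.5], threshold=1.0))
    # -> [(0, 3), (3, 5)]; the remaining tokens lack the capacity for another bit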
arXiv:2502.04417 [pdf, other] https://arxiv.org/abs/2502.04417
Categories: cs.LG (Machine Learning); cs.AI (Artificial Intelligence)
Title: NeuralMOVES: A lightweight and microscopic vehicle emission estimation model based on reverse engineering and surrogate learning
Authors: Edgar Ramirez-Sanchez, Catherine Tang, Yaosheng Xu, Nrithya Renganathan, Vindula Jayawardana, Zhengbing He, Cathy Wu
Abstract: The transportation sector significantly contributes to greenhouse gas emissions, necessitating accurate emission models to guide mitigation strategies. Despite its field validation and certification, the industry-standard Motor Vehicle Emission Simulator (MOVES) faces challenges related to complexity in usage, high computational demands, and its unsuitability for microscopic real-time applications. To address these limitations, we present NeuralMOVES, a comprehensive suite of high-performance, lightweight surrogate models for vehicle CO2 emissions. Developed through reverse engineering and neural networks, NeuralMOVES achieves a 6.013% Mean Average Percentage Error relative to MOVES across extensive tests spanning over two million scenarios with diverse trajectories and varied environmental and vehicle factors. At only 2.4 MB, NeuralMOVES condenses the original MOVES and the reverse-engineered MOVES into a compact representation while maintaining high accuracy. NeuralMOVES therefore significantly enhances accessibility while preserving the accuracy of MOVES, simplifying CO2 evaluation for transportation analyses and enabling real-time, microscopic applications across diverse scenarios without reliance on complex software or extensive computational resources. Moreover, this paper provides, for the first time, a framework for reverse engineering industrial-grade software tailored specifically to transportation scenarios, going beyond MOVES. The surrogate models are available at https://github.com/edgar-rs/neuralMOVES
Submitted 6 February, 2025; originally announced February 2025.
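A surrogate of this kind maps instantaneous vehicle state to an emission rate with a small learned model. The snippet below is only an illustrative stand-in (random weights, arbitrary units); the actual NeuralMOVES models, features, and training procedure are described in the paper and repository.

    import numpy as np

    rng = np.random.default_rng(1)
    # Tiny MLP: (speed m/s, acceleration m/s^2, road grade %) -> CO2 rate.
    # Weights are random here; a real surrogate would be fit to reference
    # simulator outputs over many driving scenarios.
    W1, b1 = rng.normal(size=(3, 16)), np.zeros(16)
    W2, b2 = rng.normal(size=(16, 1)), np.zeros(1)

    def co2_rate(states):
        h = np.maximum(0.0, states @ W1 + b1)      # ReLU hidden layer
        return (h @ W2 + b2).squeeze()

    trajectory = np.array([[12.0, 0.3, 0.0],
                           [13.0, 0.5, 1.0],
                           [13.5, 0.0, 1.0]])      # one row per second
    print(co2_rate(trajectory).sum())              # trajectory total (arbitrary units)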
arXiv:2502.02234 [pdf, other] https://arxiv.org/abs/2502.02234
Categories: cs.CV (Computer Vision and Pattern Recognition); cs.LG (Machine Learning)
Title: Mask-informed Deep Contrastive Incomplete Multi-view Clustering
Authors: Zhenglai Li, Yuqi Shi, Xiao He, Chang Tang
Abstract: Multi-view clustering (MvC) utilizes information from multiple views to uncover the underlying structures of data. Despite significant advancements in MvC, mitigating the impact of missing samples in specific views on the integration of knowledge from different views remains a critical challenge. This paper proposes a novel Mask-informed Deep Contrastive Incomplete Multi-view Clustering (Mask-IMvC) method, which elegantly identifies a view-common representation for clustering. Specifically, we introduce a mask-informed fusion network that aggregates incomplete multi-view information while considering the observation status of samples across various views as a mask, thereby reducing the adverse effects of missing values. Additionally, we design a prior knowledge-assisted contrastive learning loss that boosts the representation capability of the aggregated view-common representation by injecting neighborhood information of samples from different views. Finally, extensive experiments are conducted to demonstrate the superiority of the proposed Mask-IMvC method over state-of-the-art approaches across multiple MvC datasets, in both complete and incomplete scenarios.
Submitted 4 February, 2025; originally announced February 2025.
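The mask-informed fusion described above can be pictured as aggregating per-view embeddings while an observation mask switches missing views off. A minimal sketch of such mask-weighted averaging follows; it is a simplification for illustration, not the fusion network proposed in the paper.

    import numpy as np

    def mask_informed_fusion(views, mask):
        """Average per-view embeddings over observed views only.

        views : (n_views, n_samples, dim) view-specific embeddings
        mask  : (n_views, n_samples) with 1 = observed, 0 = missing
        """
        w = mask[:, :, None]                                     # broadcast over features
        return (views * w).sum(axis=0) / np.clip(w.sum(axis=0), 1, None)

    views = np.stack([np.ones((4, 2)), 3 * np.ones((4, 2))])     # two toy views
    mask = np.array([[1, 1, 0, 1],
                     [1, 0, 1, 1]])
    print(mask_informed_fusion(views, mask))
    # samples seen by both views -> 2.0, only view 0 -> 1.0, only view 1 -> 3.0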
(709 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.14249v5-abstract-short" style="display: inline;"> Benchmarks are important tools for tracking the rapid advancements in large language model (LLM) capabilities. However, benchmarks are not keeping pace in difficulty: LLMs now achieve over 90\% accuracy on popular benchmarks like MMLU, limiting informed measurement of state-of-the-art LLM capabilities. In response, we introduce Humanity&#39;s Last Exam (HLE), a multi-modal benchmark at the frontier of&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.14249v5-abstract-full').style.display = 'inline'; document.getElementById('2501.14249v5-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.14249v5-abstract-full" style="display: none;"> Benchmarks are important tools for tracking the rapid advancements in large language model (LLM) capabilities. However, benchmarks are not keeping pace in difficulty: LLMs now achieve over 90\% accuracy on popular benchmarks like MMLU, limiting informed measurement of state-of-the-art LLM capabilities. In response, we introduce Humanity&#39;s Last Exam (HLE), a multi-modal benchmark at the frontier of human knowledge, designed to be the final closed-ended academic benchmark of its kind with broad subject coverage. HLE consists of 2,700 questions across dozens of subjects, including mathematics, humanities, and the natural sciences. HLE is developed globally by subject-matter experts and consists of multiple-choice and short-answer questions suitable for automated grading. Each question has a known solution that is unambiguous and easily verifiable, but cannot be quickly answered via internet retrieval. State-of-the-art LLMs demonstrate low accuracy and calibration on HLE, highlighting a significant gap between current LLM capabilities and the expert human frontier on closed-ended academic questions. To inform research and policymaking upon a clear understanding of model capabilities, we publicly release HLE at https://lastexam.ai. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.14249v5-abstract-full').style.display = 'none'; document.getElementById('2501.14249v5-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 24 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">27 pages, 6 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.14194">arXiv:2501.14194</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2501.14194">pdf</a>, <a href="https://arxiv.org/format/2501.14194">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> ENTER: Event Based Interpretable Reasoning for VideoQA </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Ayyubi%2C+H">Hammad Ayyubi</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+J">Junzhang Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Asgarov%2C+A">Ali Asgarov</a>, <a href="/search/cs?searchtype=author&amp;query=Hakim%2C+Z+I+A">Zaber Ibn Abdul Hakim</a>, <a href="/search/cs?searchtype=author&amp;query=Sarker%2C+N+H">Najibul Haque Sarker</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Z">Zhecan Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Tang%2C+C">Chia-Wei Tang</a>, <a href="/search/cs?searchtype=author&amp;query=Alomari%2C+H">Hani Alomari</a>, <a href="/search/cs?searchtype=author&amp;query=Atabuzzaman%2C+M">Md. Atabuzzaman</a>, <a href="/search/cs?searchtype=author&amp;query=Lin%2C+X">Xudong Lin</a>, <a href="/search/cs?searchtype=author&amp;query=Dyava%2C+N+R">Naveen Reddy Dyava</a>, <a href="/search/cs?searchtype=author&amp;query=Chang%2C+S">Shih-Fu Chang</a>, <a href="/search/cs?searchtype=author&amp;query=Thomas%2C+C">Chris Thomas</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.14194v1-abstract-short" style="display: inline;"> In this paper, we present ENTER, an interpretable Video Question Answering (VideoQA) system based on event graphs. Event graphs convert videos into graphical representations, where video events form the nodes and event-event relationships (temporal/causal/hierarchical) form the edges. This structured representation offers many benefits: 1) Interpretable VideoQA via generated code that parses event&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.14194v1-abstract-full').style.display = 'inline'; document.getElementById('2501.14194v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.14194v1-abstract-full" style="display: none;"> In this paper, we present ENTER, an interpretable Video Question Answering (VideoQA) system based on event graphs. Event graphs convert videos into graphical representations, where video events form the nodes and event-event relationships (temporal/causal/hierarchical) form the edges. 
This structured representation offers many benefits: 1) interpretable VideoQA via generated code that parses the event graph; 2) incorporation of contextual visual information into the reasoning process (code generation) via event graphs; 3) robust VideoQA via hierarchical iterative update of the event graphs. Existing interpretable VideoQA systems are often top-down, disregarding low-level visual information in reasoning-plan generation, and are brittle. While bottom-up approaches produce responses from visual data, they lack interpretability. Experimental results on NExT-QA, IntentQA, and EgoSchema demonstrate that our method not only outperforms existing top-down approaches while obtaining competitive performance against bottom-up approaches, but, more importantly, offers superior interpretability and explainability in the reasoning process.
Submitted 23 January, 2025; originally announced January 2025.
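To make the event-graph representation concrete: a graph of video events with labeled temporal/causal edges can be queried by a small generated program. The sketch below is a toy illustration only; the names (Event, EventGraph, what_happened_after) are hypothetical and not taken from the ENTER paper or its code.

```python
from dataclasses import dataclass, field

@dataclass
class Event:
    eid: str
    description: str
    start: float  # seconds into the video
    end: float

@dataclass
class EventGraph:
    events: dict = field(default_factory=dict)   # eid -> Event
    edges: list = field(default_factory=list)    # (src_eid, relation, dst_eid)

    def add_event(self, ev: Event):
        self.events[ev.eid] = ev

    def add_edge(self, src: str, relation: str, dst: str):
        self.edges.append((src, relation, dst))

    def neighbors(self, eid: str, relation: str):
        return [dst for s, r, dst in self.edges if s == eid and r == relation]

# A "generated program" for a question such as "What happened after the dog
# barked?" reduces to a small graph query like this one.
def what_happened_after(graph: EventGraph, eid: str):
    return [graph.events[n].description for n in graph.neighbors(eid, "before")]

if __name__ == "__main__":
    g = EventGraph()
    g.add_event(Event("e1", "the dog barks", 2.0, 3.5))
    g.add_event(Event("e2", "the man opens the door", 4.0, 6.0))
    g.add_edge("e1", "before", "e2")     # temporal relation
    g.add_edge("e1", "causes", "e2")     # causal relation
    print(what_happened_after(g, "e1"))  # ['the man opens the door']
```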
href="/search/cs?searchtype=author&amp;query=Zhu%2C+H">Han Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Ding%2C+H">Hao Ding</a>, <a href="/search/cs?searchtype=author&amp;query=Hu%2C+H">Hao Hu</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+H">Hao Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+H">Hao Zhang</a> , et al. (69 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.12599v2-abstract-short" style="display: inline;"> Language model pretraining with next token prediction has proved effective for scaling compute but is limited to the amount of available training data. Scaling reinforcement learning (RL) unlocks a new axis for the continued improvement of artificial intelligence, with the promise that large language models (LLMs) can scale their training data by learning to explore with rewards. However, prior pu&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.12599v2-abstract-full').style.display = 'inline'; document.getElementById('2501.12599v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.12599v2-abstract-full" style="display: none;"> Language model pretraining with next token prediction has proved effective for scaling compute but is limited to the amount of available training data. Scaling reinforcement learning (RL) unlocks a new axis for the continued improvement of artificial intelligence, with the promise that large language models (LLMs) can scale their training data by learning to explore with rewards. However, prior published work has not produced competitive results. In light of this, we report on the training practice of Kimi k1.5, our latest multi-modal LLM trained with RL, including its RL training techniques, multi-modal data recipes, and infrastructure optimization. Long context scaling and improved policy optimization methods are key ingredients of our approach, which establishes a simplistic, effective RL framework without relying on more complex techniques such as Monte Carlo tree search, value functions, and process reward models. Notably, our system achieves state-of-the-art reasoning performance across multiple benchmarks and modalities -- e.g., 77.5 on AIME, 96.2 on MATH 500, 94-th percentile on Codeforces, 74.9 on MathVista -- matching OpenAI&#39;s o1. Moreover, we present effective long2short methods that use long-CoT techniques to improve short-CoT models, yielding state-of-the-art short-CoT reasoning results -- e.g., 60.8 on AIME, 94.6 on MATH500, 47.3 on LiveCodeBench -- outperforming existing short-CoT models such as GPT-4o and Claude Sonnet 3.5 by a large margin (up to +550%). <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.12599v2-abstract-full').style.display = 'none'; document.getElementById('2501.12599v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 21 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">25 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.11592">arXiv:2501.11592</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2501.11592">pdf</a>, <a href="https://arxiv.org/format/2501.11592">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Training-free Ultra Small Model for Universal Sparse Reconstruction in Compressed Sensing </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Tang%2C+C">Chaoqing Tang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhuang%2C+H">Huanze Zhuang</a>, <a href="/search/cs?searchtype=author&amp;query=Tian%2C+G">Guiyun Tian</a>, <a href="/search/cs?searchtype=author&amp;query=Zeng%2C+Z">Zhenli Zeng</a>, <a href="/search/cs?searchtype=author&amp;query=Ding%2C+Y">Yi Ding</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+W">Wenzhong Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Bai%2C+X">Xiang Bai</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.11592v2-abstract-short" style="display: inline;"> Pre-trained large models attract widespread attention in recent years, but they face challenges in applications that require high interpretability or have limited resources, such as physical sensing, medical imaging, and bioinformatics. Compressed Sensing (CS) is a well-proved theory that drives many recent breakthroughs in these applications. However, as a typical under-determined linear system,&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.11592v2-abstract-full').style.display = 'inline'; document.getElementById('2501.11592v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.11592v2-abstract-full" style="display: none;"> Pre-trained large models attract widespread attention in recent years, but they face challenges in applications that require high interpretability or have limited resources, such as physical sensing, medical imaging, and bioinformatics. Compressed Sensing (CS) is a well-proved theory that drives many recent breakthroughs in these applications. However, as a typical under-determined linear system, CS suffers from excessively long sparse reconstruction times when using traditional iterative methods, particularly with large-scale data. Current AI methods like deep unfolding fail to substitute them because pre-trained models exhibit poor generality beyond their training conditions and dataset distributions, or lack interpretability. 
Instead of following the big-model fervor, this paper proposes ultra-small artificial neural models called coefficients learning (CL), enabling training-free and rapid sparse reconstruction while fully inheriting the generality and interpretability of traditional iterative methods, and adding the new capability of incorporating prior knowledge. In CL, a signal of length $n$ needs a minimum of only $n$ trainable parameters. A case-study model called CLOMP is implemented for evaluation. Experiments are conducted on both synthetic and real one-dimensional and two-dimensional signals, demonstrating significant improvements in efficiency and accuracy. Compared to representative iterative methods, CLOMP improves efficiency by 100 to 1000 fold for large-scale data. Test results on eight diverse image datasets indicate that CLOMP improves the structural similarity index by 292%, 98%, and 45% for sampling rates of 0.1, 0.3, and 0.5, respectively. We believe this method can truly usher CS reconstruction into the AI era, benefiting countless under-determined linear systems that rely on sparse solutions.
Submitted 23 January, 2025; v1 submitted 20 January, 2025; originally announced January 2025.
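The parameter-count claim above (a length-$n$ signal needs only $n$ trainable coefficients) can be illustrated with a generic sketch that treats the coefficients themselves as the only parameters and runs proximal gradient descent on a least-squares data term with an l1 penalty. This is standard compressed-sensing code under assumed problem sizes and step size; it is not the authors' CLOMP implementation and omits their OMP-style case study.

```python
import numpy as np

# Generic sketch: recover a sparse signal x (length n) from m < n measurements
# y = A @ x by optimizing the n coefficients c directly with proximal gradient
# descent on 0.5 * ||A c - y||^2 + lam * ||c||_1.
rng = np.random.default_rng(0)
n, m, k = 256, 64, 8                       # signal length, measurements, sparsity
A = rng.standard_normal((m, n)) / np.sqrt(m)
x_true = np.zeros(n)
x_true[rng.choice(n, k, replace=False)] = rng.standard_normal(k)
y = A @ x_true

c = np.zeros(n)                            # the n trainable coefficients
lam, lr = 0.02, 0.05                       # assumed penalty weight and step size
for _ in range(3000):
    c -= lr * (A.T @ (A @ c - y))                            # gradient step
    c = np.sign(c) * np.maximum(np.abs(c) - lr * lam, 0.0)   # soft-threshold (l1 prox)

print("relative error:", np.linalg.norm(c - x_true) / np.linalg.norm(x_true))
```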
arXiv:2501.09347 (https://arxiv.org/abs/2501.09347) [pdf, other]
Subjects: cs.CV (Computer Vision and Pattern Recognition)
UVRM: A Scalable 3D Reconstruction Model from Unposed Videos
Authors: Shiu-hong Kao, Xiao Li, Jinglu Wang, Yang Li, Chi-Keung Tang, Yu-Wing Tai, Yan Lu
Abstract: Large Reconstruction Models (LRMs) have recently become a popular method for creating 3D foundational models. Training 3D reconstruction models with 2D visual data traditionally requires prior knowledge of camera poses for the training samples, a process that is both time-consuming and prone to errors. Consequently, 3D reconstruction training has been confined to either synthetic 3D datasets or small-scale datasets with annotated poses. In this study, we investigate the feasibility of 3D reconstruction using unposed video data of various objects. We introduce UVRM, a novel 3D reconstruction model capable of being trained and evaluated on monocular videos without requiring any pose information. UVRM uses a transformer network to implicitly aggregate video frames into a pose-invariant latent feature space, which is then decoded into a tri-plane 3D representation. To obviate the need for ground-truth pose annotations during training, UVRM employs a combination of the score distillation sampling (SDS) method and an analysis-by-synthesis approach, progressively synthesizing pseudo novel views using a pre-trained diffusion model. We qualitatively and quantitatively evaluate UVRM's performance on the G-Objaverse and CO3D datasets without relying on pose information. Extensive experiments show that UVRM is capable of effectively and efficiently reconstructing a wide range of 3D objects from unposed videos.
Submitted 8 March, 2025; v1 submitted 16 January, 2025; originally announced January 2025.
arXiv:2501.07890 (https://arxiv.org/abs/2501.07890) [pdf, other]
Subjects: cs.CL (Computation and Language); cs.AI (Artificial Intelligence)
GRAPHMOE: Amplifying Cognitive Depth of Mixture-of-Experts Network via Introducing Self-Rethinking Mechanism
Authors: Chen Tang, Bo Lv, Zifan Zheng, Bohao Yang, Kun Zhao, Ning Liao, Xiaoxing Wang, Feiyu Xiong, Zhiyu Li, Nayu Liu, Jingchi Jiang
Abstract: Traditional Mixture-of-Experts (MoE) networks benefit from utilizing multiple smaller expert models as opposed to a single large network. However, these experts typically operate independently, leaving open the question of whether interconnecting these models could enhance the performance of MoE networks. In response, we introduce GRAPHMOE, a novel method aimed at augmenting the cognitive depth of language models via a self-rethinking mechanism constructed on pseudo GraphMoE networks. GRAPHMOE employs a recurrent routing strategy to simulate iterative thinking steps, thereby facilitating the flow of information among expert nodes. We implement the GRAPHMOE architecture using Low-Rank Adaptation (LoRA) techniques and conduct extensive experiments on various benchmark datasets. The experimental results reveal that GRAPHMOE outperforms other LoRA-based models, achieving state-of-the-art (SOTA) performance.
Additionally, this study explores a novel recurrent routing strategy that may inspire further advancements in enhancing the reasoning capabilities of language models.
Submitted 11 February, 2025; v1 submitted 14 January, 2025; originally announced January 2025.
Comments: 10 pages
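The self-rethinking mechanism can be pictured as routing the same representation through a pool of small LoRA-style experts for several rounds, with each round's mixed output fed back to the router. The sketch below is a toy interpretation under assumed settings (4 experts, rank-8 adapters, 3 rethinking steps, top-2 routing); it is not the paper's architecture.

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class TinyExpert(nn.Module):
    """A small low-rank (LoRA-style) expert: up-projection of a rank-r bottleneck."""
    def __init__(self, d: int, r: int = 8):
        super().__init__()
        self.down = nn.Linear(d, r, bias=False)
        self.up = nn.Linear(r, d, bias=False)

    def forward(self, x):
        return self.up(torch.relu(self.down(x)))

class RecurrentRoutedMoE(nn.Module):
    """Toy self-rethinking MoE: route, mix, feed the mixture back, repeat."""
    def __init__(self, d: int, n_experts: int = 4, steps: int = 3, top_k: int = 2):
        super().__init__()
        self.experts = nn.ModuleList(TinyExpert(d) for _ in range(n_experts))
        self.router = nn.Linear(d, n_experts)
        self.steps, self.top_k = steps, top_k

    def forward(self, x):                       # x: (batch, d)
        h = x
        for _ in range(self.steps):             # iterative "thinking" steps
            logits = self.router(h)
            topv, topi = logits.topk(self.top_k, dim=-1)
            gates = torch.zeros_like(logits).scatter_(-1, topi, F.softmax(topv, -1))
            mixed = sum(gates[:, i:i + 1] * e(h) for i, e in enumerate(self.experts))
            h = h + mixed                       # residual update carried to the next round
        return h

if __name__ == "__main__":
    layer = RecurrentRoutedMoE(d=32)
    print(layer(torch.randn(5, 32)).shape)      # torch.Size([5, 32])
```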
arXiv:2501.05339 (https://arxiv.org/abs/2501.05339) [pdf, other]
Subjects: cs.CV (Computer Vision and Pattern Recognition)
JAQ: Joint Efficient Architecture Design and Low-Bit Quantization with Hardware-Software Co-Exploration
Authors: Mingzi Wang, Yuan Meng, Chen Tang, Weixiang Zhang, Yijian Qin, Yang Yao, Yingxin Li, Tongtong Feng, Xin Wang, Xun Guan, Zhi Wang, Wenwu Zhu
Abstract: The co-design of neural network architectures, quantization precisions, and hardware accelerators offers a promising approach to achieving an optimal balance between performance and efficiency, particularly for model deployment on resource-constrained edge devices. In this work, we propose the JAQ framework, which jointly optimizes these three critical dimensions. However, effectively automating the design process across the vast search space of the three dimensions poses significant challenges, especially when pursuing extremely low-bit quantization. Specifically, the primary challenges are: (1) memory overhead on the software side: low-precision quantization-aware training can lead to significant memory usage due to storing large intermediate features and latent weights for back-propagation, potentially causing memory exhaustion; and (2) time-consuming search on the hardware side: the discrete nature of hardware parameters and the complex interplay between compiler optimizations and individual operators make the accelerator search time-consuming. To address these issues, JAQ mitigates the memory overhead through a channel-wise sparse quantization (CSQ) scheme, selectively applying quantization to the most sensitive components of the model during optimization. Additionally, JAQ designs BatchTile, which employs a hardware generation network to encode all possible tiling modes, thereby speeding up the search for the optimal compiler mapping strategy. Extensive experiments demonstrate the effectiveness of JAQ, achieving approximately 7% higher Top-1 accuracy on ImageNet compared to previous methods and reducing the hardware search time per iteration to 0.15 seconds.
Submitted 9 January, 2025; originally announced January 2025.
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by AAAI 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.02564">arXiv:2501.02564</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2501.02564">pdf</a>, <a href="https://arxiv.org/format/2501.02564">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Balanced Multi-view Clustering </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Li%2C+Z">Zhenglai Li</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+J">Jun Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Tang%2C+C">Chang Tang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+X">Xinzhong Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+W">Wei Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+X">Xinwang Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.02564v3-abstract-short" style="display: inline;"> Multi-view clustering (MvC) aims to integrate information from different views to enhance the capability of the model in capturing the underlying data structures. The widely used joint training paradigm in MvC is potentially not fully leverage the multi-view information, since the imbalanced and under-optimized view-specific features caused by the uniform learning objective for all views. For inst&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.02564v3-abstract-full').style.display = 'inline'; document.getElementById('2501.02564v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.02564v3-abstract-full" style="display: none;"> Multi-view clustering (MvC) aims to integrate information from different views to enhance the capability of the model in capturing the underlying data structures. The widely used joint training paradigm in MvC is potentially not fully leverage the multi-view information, since the imbalanced and under-optimized view-specific features caused by the uniform learning objective for all views. For instance, particular views with more discriminative information could dominate the learning process in the joint training paradigm, leading to other views being under-optimized. To alleviate this issue, we first analyze the imbalanced phenomenon in the joint-training paradigm of multi-view clustering from the perspective of gradient descent for each view-specific feature extractor. Then, we propose a novel balanced multi-view clustering (BMvC) method, which introduces a view-specific contrastive regularization (VCR) to modulate the optimization of each view. 
arXiv:2501.02564 (https://arxiv.org/abs/2501.02564) [pdf, other]
Subjects: cs.CV (Computer Vision and Pattern Recognition); cs.AI (Artificial Intelligence); cs.LG (Machine Learning)
Balanced Multi-view Clustering
Authors: Zhenglai Li, Jun Wang, Chang Tang, Xinzhong Zhu, Wei Zhang, Xinwang Liu
Abstract: Multi-view clustering (MvC) aims to integrate information from different views to enhance the capability of the model in capturing the underlying data structures. The widely used joint training paradigm in MvC potentially does not fully leverage the multi-view information, owing to the imbalanced and under-optimized view-specific features caused by the uniform learning objective for all views. For instance, particular views with more discriminative information could dominate the learning process in the joint training paradigm, leaving other views under-optimized. To alleviate this issue, we first analyze the imbalance phenomenon in the joint-training paradigm of multi-view clustering from the perspective of gradient descent for each view-specific feature extractor. Then, we propose a novel balanced multi-view clustering (BMvC) method, which introduces a view-specific contrastive regularization (VCR) to modulate the optimization of each view. Concretely, VCR preserves the sample similarities captured from the joint features and the view-specific ones in the clustering distributions corresponding to view-specific features, enhancing the learning process of the view-specific feature extractors. Additionally, a theoretical analysis is provided to illustrate that VCR adaptively modulates the magnitudes of the gradients used to update the parameters of the view-specific feature extractors, achieving a balanced multi-view learning procedure. In this manner, BMvC achieves a better trade-off between the exploitation of view-specific patterns and the exploration of view-invariant patterns to fully learn the multi-view information for the clustering task. Finally, a set of experiments is conducted to verify the superiority of the proposed method compared with state-of-the-art approaches on eight benchmark MvC datasets.
Submitted 4 February, 2025; v1 submitted 5 January, 2025; originally announced January 2025.
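As a rough illustration of the regularizer described above, the toy loss below asks each view's clustering distribution to reproduce pairwise sample similarities computed from the joint and view-specific features. The specific similarity measures, temperature, and normalizations are assumptions; the paper's VCR may be defined differently in detail.

```python
import torch
import torch.nn.functional as F

def vcr_style_loss(z_view: torch.Tensor, z_joint: torch.Tensor,
                   cluster_logits: torch.Tensor, tau: float = 0.5) -> torch.Tensor:
    """Toy view-specific similarity-preserving regularizer (not the paper's exact VCR):
    pairwise sample similarities from joint and view-specific features should be
    mirrored by similarities between the view's clustering distributions."""
    p = F.softmax(cluster_logits, dim=1)                 # (N, K) clustering distribution
    sim_target = 0.5 * (F.normalize(z_joint) @ F.normalize(z_joint).T
                        + F.normalize(z_view) @ F.normalize(z_view).T)
    sim_pred = F.normalize(p) @ F.normalize(p).T         # similarity in cluster space
    return F.mse_loss(sim_pred, torch.sigmoid(sim_target / tau))

if __name__ == "__main__":
    N, d, K = 32, 16, 5
    z_view, z_joint = torch.randn(N, d), torch.randn(N, d)
    logits = torch.randn(N, K, requires_grad=True)
    loss = vcr_style_loss(z_view, z_joint, logits)
    loss.backward()                                      # gradients flow to the view head
    print(float(loss), logits.grad.shape)
```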
arXiv:2412.16524 (https://arxiv.org/abs/2412.16524) [pdf, other]
Subjects: cs.CV (Computer Vision and Pattern Recognition)
LLaVA-SLT: Visual Language Tuning for Sign Language Translation
Authors: Han Liang, Chengyu Huang, Yuecheng Xu, Cheng Tang, Weicai Ye, Juze Zhang, Xin Chen, Jingyi Yu, Lan Xu
Abstract: In the realm of Sign Language Translation (SLT), reliance on costly gloss-annotated datasets has posed a significant barrier. Recent advancements in gloss-free SLT methods have shown promise, yet they often lag far behind gloss-based approaches in terms of translation accuracy. To narrow this performance gap, we introduce LLaVA-SLT, a pioneering Large Multimodal Model (LMM) framework designed to leverage the power of Large Language Models (LLMs) through effectively learned visual language embeddings. Our model is trained in three stages. First, we propose linguistic continued pretraining: we scale up the LLM and adapt it to the sign language domain using an extensive corpus, effectively enhancing its textual linguistic knowledge about sign language. Then, we adopt visual contrastive pretraining to align the visual encoder with a large-scale pretrained text encoder; we propose a hierarchical visual encoder that learns a robust word-level intermediate representation compatible with LLM token embeddings. Finally, we propose visual language tuning: we freeze the pretrained models and employ a lightweight trainable MLP connector that efficiently maps the pretrained visual language embeddings into the LLM token embedding space, enabling the downstream SLT task. Our comprehensive experiments demonstrate that LLaVA-SLT outperforms state-of-the-art methods. By using extra annotation-free data, it even approaches gloss-based accuracy.
Submitted 21 December, 2024; originally announced December 2024.
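The final tuning stage above hinges on a small trainable projector between frozen encoders; in sketch form it is just an MLP from the visual embedding width to the LLM token-embedding width. The dimensions and the class name below are placeholders, not values from the paper.

```python
import torch
import torch.nn as nn

class VisualToTokenConnector(nn.Module):
    """Toy lightweight MLP connector: maps frozen visual embeddings (d_vis)
    into the LLM token-embedding space (d_llm). Only this module would be
    trainable in the final visual language tuning stage."""
    def __init__(self, d_vis: int = 768, d_llm: int = 4096, hidden: int = 2048):
        super().__init__()
        self.mlp = nn.Sequential(
            nn.Linear(d_vis, hidden),
            nn.GELU(),
            nn.Linear(hidden, d_llm),
        )

    def forward(self, visual_tokens: torch.Tensor) -> torch.Tensor:
        # visual_tokens: (batch, num_tokens, d_vis) from a frozen visual encoder
        return self.mlp(visual_tokens)           # (batch, num_tokens, d_llm)

if __name__ == "__main__":
    connector = VisualToTokenConnector()
    fake_visual = torch.randn(2, 32, 768)        # placeholder visual embeddings
    print(connector(fake_visual).shape)          # torch.Size([2, 32, 4096])
```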
arXiv:2412.16487 (https://arxiv.org/abs/2412.16487) [pdf, other]
Subjects: cs.CV (Computer Vision and Pattern Recognition)
Trusted Mamba Contrastive Network for Multi-View Clustering
Authors: Jian Zhu, Xin Zou, Lei Liu, Zhangmin Huang, Ying Zhang, Chang Tang, Li-Rong Dai
Abstract: Multi-view clustering can partition data samples into their categories by learning a consensus representation in an unsupervised way, and it has received increasing attention in recent years. However, there is an untrusted fusion problem, for two reasons: 1) current methods ignore the presence of noise or redundant information in the views; 2) in deep multi-view clustering, the similarity used for contrastive learning comes from the same sample rather than the same cluster. This can drive multi-view fusion in the wrong direction. This paper proposes a novel multi-view clustering network to address this problem, termed the Trusted Mamba Contrastive Network (TMCN). Specifically, we present a new Trusted Mamba Fusion Network (TMFN), which achieves trusted fusion of multi-view data through a selective mechanism. Moreover, we align the fused representation and the view-specific representations using the Average-similarity Contrastive Learning (AsCL) module. AsCL increases the similarity of view representations from the same cluster, not merely from the same sample. Extensive experiments show that the proposed method achieves state-of-the-art results in deep multi-view clustering tasks. The source code is available at https://github.com/HackerHyper/TMCN.
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.16487v2-abstract-full').style.display = 'none'; document.getElementById('2412.16487v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 21 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">accepted by 2025 IEEE International Conference on Acoustics, Speech, and Signal Processing(ICASSP2025)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.14526">arXiv:2412.14526</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.14526">pdf</a>, <a href="https://arxiv.org/format/2412.14526">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computers and Society">cs.CY</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1145/3672608.3707805">10.1145/3672608.3707805 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Knowledge Distillation in RNN-Attention Models for Early Prediction of Student Performance </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Leelaluk%2C+S">Sukrit Leelaluk</a>, <a href="/search/cs?searchtype=author&amp;query=Tang%2C+C">Cheng Tang</a>, <a href="/search/cs?searchtype=author&amp;query=%C5%A0v%C3%A1bensk%C3%BD%2C+V">Valdemar 艩v谩bensk媒</a>, <a href="/search/cs?searchtype=author&amp;query=Shimada%2C+A">Atsushi Shimada</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.14526v1-abstract-short" style="display: inline;"> Educational data mining (EDM) is a part of applied computing that focuses on automatically analyzing data from learning contexts. Early prediction for identifying at-risk students is a crucial and widely researched topic in EDM research. It enables instructors to support at-risk students to stay on track, preventing student dropout or failure. Previous studies have predicted students&#39; learning per&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.14526v1-abstract-full').style.display = 'inline'; document.getElementById('2412.14526v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.14526v1-abstract-full" style="display: none;"> Educational data mining (EDM) is a part of applied computing that focuses on automatically analyzing data from learning contexts. Early prediction for identifying at-risk students is a crucial and widely researched topic in EDM research. 
It enables instructors to support at-risk students in staying on track, preventing student dropout or failure. Previous studies have predicted students' learning performance to identify at-risk students by using machine learning on data collected from e-learning platforms. However, most studies aimed to identify at-risk students using the entire course data after the course had finished. This does not correspond to the real-world scenario in which at-risk students may drop out before the course ends. To address this problem, we introduce an RNN-Attention-KD (knowledge distillation) framework to predict at-risk students early throughout a course. It leverages the strengths of Recurrent Neural Networks (RNNs) in handling time-sequence data to predict students' performance at each time step and employs an attention mechanism to focus on relevant time steps for improved predictive accuracy. At the same time, KD is applied to compress the time steps to facilitate early prediction. In an empirical evaluation, RNN-Attention-KD outperforms traditional neural network models in terms of recall and F1-measure. For example, it obtained recall and F1-measure of 0.49 and 0.51 for Weeks 1-3 and 0.51 and 0.61 for Weeks 1-6 across all datasets from four years of a university course. An ablation study then investigated the contributions of different knowledge transfer methods (distillation objectives). We found that hint loss from the hidden layer of the RNN and context vector loss from the attention module on the RNN could enhance the model's prediction performance for identifying at-risk students. These results are relevant for EDM researchers employing deep learning models.
Submitted 18 December, 2024; originally announced December 2024.
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Published in Proceedings of The 40th ACM/SIGAPP Symposium on Applied Computing (SAC &#39;25), see https://doi.org/10.1145/3672608.3707805</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> I.2; I.6; K.3 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.10153">arXiv:2412.10153</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.10153">pdf</a>, <a href="https://arxiv.org/format/2412.10153">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Neural and Evolutionary Computing">cs.NE</span> </div> </div> <p class="title is-5 mathjax"> EVOS: Efficient Implicit Neural Training via EVOlutionary Selector </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+W">Weixiang Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Xie%2C+S">Shuzhao Xie</a>, <a href="/search/cs?searchtype=author&amp;query=Ren%2C+C">Chengwei Ren</a>, <a href="/search/cs?searchtype=author&amp;query=Xie%2C+S">Siyi Xie</a>, <a href="/search/cs?searchtype=author&amp;query=Tang%2C+C">Chen Tang</a>, <a href="/search/cs?searchtype=author&amp;query=Ge%2C+S">Shijia Ge</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+M">Mingzi Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Z">Zhi Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.10153v1-abstract-short" style="display: inline;"> We propose EVOlutionary Selector (EVOS), an efficient training paradigm for accelerating Implicit Neural Representation (INR). Unlike conventional INR training that feeds all samples through the neural network in each iteration, our approach restricts training to strategically selected points, reducing computational overhead by eliminating redundant forward passes. Specifically, we treat each samp&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.10153v1-abstract-full').style.display = 'inline'; document.getElementById('2412.10153v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.10153v1-abstract-full" style="display: none;"> We propose EVOlutionary Selector (EVOS), an efficient training paradigm for accelerating Implicit Neural Representation (INR). Unlike conventional INR training that feeds all samples through the neural network in each iteration, our approach restricts training to strategically selected points, reducing computational overhead by eliminating redundant forward passes. Specifically, we treat each sample as an individual in an evolutionary process, where only those fittest ones survive and merit inclusion in training, adaptively evolving with the neural network dynamics. 
arXiv:2412.10153 (https://arxiv.org/abs/2412.10153) [pdf, other]
Subjects: cs.CV (Computer Vision and Pattern Recognition); cs.MM (Multimedia); cs.NE (Neural and Evolutionary Computing)
EVOS: Efficient Implicit Neural Training via EVOlutionary Selector
Authors: Weixiang Zhang, Shuzhao Xie, Chengwei Ren, Siyi Xie, Chen Tang, Shijia Ge, Mingzi Wang, Zhi Wang
Abstract: We propose the EVOlutionary Selector (EVOS), an efficient training paradigm for accelerating Implicit Neural Representation (INR). Unlike conventional INR training, which feeds all samples through the neural network in each iteration, our approach restricts training to strategically selected points, reducing computational overhead by eliminating redundant forward passes. Specifically, we treat each sample as an individual in an evolutionary process, where only the fittest survive and merit inclusion in training, adaptively evolving with the neural network dynamics. While this is conceptually similar to Evolutionary Algorithms, their distinct objectives (selection for acceleration vs. iterative solution optimization) require a fundamental redefinition of evolutionary mechanisms for our context. In response, we design sparse fitness evaluation, frequency-guided crossover, and augmented unbiased mutation to comprise EVOS. These components respectively guide sample selection with reduced computational cost, enhance performance through frequency-domain balance, and mitigate selection bias from cached evaluation. Extensive experiments demonstrate that our method achieves an approximately 48%-66% reduction in training time while ensuring superior convergence without additional cost, establishing state-of-the-art acceleration among recent sampling-based strategies.
Submitted 13 December, 2024; originally announced December 2024.
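The selection loop above can be approximated by caching per-sample losses as fitness, training only on the currently fittest (hardest) coordinates, and mixing in a small random fraction to offset the bias of stale fitness, a stand-in for the augmented unbiased mutation. The sketch below fits a 1D signal under assumed sizes and omits frequency-guided crossover; it is not the EVOS code.

```python
import torch
import torch.nn as nn

# Toy fitness-guided sample selection for INR fitting (not the EVOS code):
# cache per-sample losses as "fitness", train only on the fittest samples,
# and mix in a few random coordinates as a crude "mutation".
torch.manual_seed(0)
N, select, mutate = 2048, 512, 64
coords = torch.linspace(0, 1, N).unsqueeze(1)               # 1D coordinates
target = torch.sin(20 * coords) + 0.3 * torch.sin(93 * coords)

inr = nn.Sequential(nn.Linear(1, 64), nn.ReLU(),
                    nn.Linear(64, 64), nn.ReLU(), nn.Linear(64, 1))
opt = torch.optim.Adam(inr.parameters(), lr=1e-3)
fitness = torch.full((N,), float("inf"))                     # cached per-sample loss

for step in range(500):
    if step % 25 == 0:                                       # sparse fitness evaluation
        with torch.no_grad():
            fitness = (inr(coords) - target).pow(2).squeeze(1)
    idx = torch.topk(fitness, select).indices                # fittest = highest error
    idx = torch.cat([idx, torch.randint(0, N, (mutate,))])   # random "mutation" samples
    loss = (inr(coords[idx]) - target[idx]).pow(2).mean()
    opt.zero_grad(); loss.backward(); opt.step()

with torch.no_grad():
    print("full MSE:", float((inr(coords) - target).pow(2).mean()))
```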
arXiv:2412.08277 (https://arxiv.org/abs/2412.08277) [pdf, ps, other]
Subjects: cs.IT (Information Theory); math.ST (Statistics Theory)
Analysis of Age of Information for a Discrete-Time Hybrid Dual-Queue System
Authors: Zhengchuan Chen, Yi Qu, Nikolaos Pappas, Chaowei Tang, Min Wang, Tony Q. S. Quek
Abstract: Using multiple sensors to update a status process of interest is promising for improving information freshness. The unordered arrival of status updates at the monitor poses a significant challenge in analyzing the timeliness performance of parallel updating systems. This work investigates the age of information (AoI) of a discrete-time dual-sensor status updating system. Specifically, status updates are generated following the zero-waiting policy. The two sensors are modeled as a geometrically distributed service-time queue and a deterministic service-time queue in parallel. We derive analytical expressions for the average AoI and the peak AoI using the graphical analysis method. Moreover, the connection between the average AoI of discrete-time and continuous-time systems is also explored. It is shown that the AoI result for the continuous-time system is a limiting case of that for the corresponding discrete-time system; hence, the discrete-time result is more general than the continuous one. Numerical results validate the effectiveness of our analysis and further show that, in most cases, randomness of service time contributes more to AoI reduction than determinacy of service time in dual-queue systems, which differs from what is known about single-queue systems.
Submitted 11 December, 2024; originally announced December 2024.
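The dual-queue model above is easy to sanity-check numerically: run a geometric-service sensor and a deterministic-service sensor under the zero-waiting policy, let the monitor keep the freshest delivered update, and average the instantaneous age over slots. The parameters below (p = 0.2, D = 5) are arbitrary choices, and the simulation is a sketch rather than the paper's closed-form graphical analysis.

```python
import numpy as np

# Discrete-time simulation sketch of a dual-sensor, zero-waiting updating system:
# sensor 1 has geometric service times (success probability p), sensor 2 has a
# deterministic service time of D slots; the monitor keeps the freshest update.
rng = np.random.default_rng(1)
T, p, D = 200_000, 0.2, 5

def geometric_service():
    return rng.geometric(p)          # service time in slots, >= 1

# (generation_time, completion_time) of the update each sensor is serving
gen1, done1 = 0, geometric_service()
gen2, done2 = 0, D
latest = 0                           # generation time of freshest delivered update
age_sum = 0

for t in range(1, T + 1):
    if t >= done1:                   # sensor 1 delivers, then immediately samples anew
        latest = max(latest, gen1)
        gen1, done1 = t, t + geometric_service()
    if t >= done2:                   # sensor 2 delivers, then immediately samples anew
        latest = max(latest, gen2)
        gen2, done2 = t, t + D
    age_sum += t - latest            # instantaneous AoI at slot t

print("simulated average AoI:", age_sum / T)
```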
arXiv:2412.07589 (https://arxiv.org/abs/2412.07589) [pdf, other]
Subjects: cs.CV (Computer Vision and Pattern Recognition)
DiffSensei: Bridging Multi-Modal LLMs and Diffusion Models for Customized Manga Generation
Authors: Jianzong Wu, Chao Tang, Jingbo Wang, Yanhong Zeng, Xiangtai Li, Yunhai Tong
Abstract: Story visualization, the task of creating visual narratives from textual descriptions, has seen progress with text-to-image generation models. However, these models often lack effective control over character appearances and interactions, particularly in multi-character scenes. To address these limitations, we propose a new task, customized manga generation, and introduce DiffSensei, an innovative framework specifically designed for generating manga with dynamic multi-character control. DiffSensei integrates a diffusion-based image generator with a multimodal large language model (MLLM) that acts as a text-compatible identity adapter. Our approach employs masked cross-attention to seamlessly incorporate character features, enabling precise layout control without direct pixel transfer. Additionally, the MLLM-based adapter adjusts character features to align with panel-specific text cues, allowing flexible adjustments in character expressions, poses, and actions. We also introduce MangaZero, a large-scale dataset tailored to this task, containing 43,264 manga pages and 427,147 annotated panels, supporting the visualization of varied character interactions and movements across sequential frames. Extensive experiments demonstrate that DiffSensei outperforms existing models, marking a significant advancement in manga generation by enabling text-adaptable character customization.
The project page is https://jianzongwu.github.io/projects/diffsensei/.
Submitted 13 March, 2025; v1 submitted 10 December, 2024; originally announced December 2024.
Comments: [CVPR 2025] The project page is https://jianzongwu.github.io/projects/diffsensei/
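
The abstract credits masked cross-attention for injecting character features under layout control. As a generic illustration of that mechanism only (not DiffSensei's code), the sketch below lets image latent tokens attend to each character's identity tokens and writes the result back only inside that character's layout mask; tensor shapes and the residual update are assumptions.

    import torch

    def masked_cross_attention(latent, char_feats, char_masks):
        """
        latent:     (B, N, C)    image latent tokens (N spatial positions)
        char_feats: (B, K, M, C) identity tokens for K characters, M tokens each
        char_masks: (B, K, N)    binary layout masks: where each character may appear
        Injects each character's features only inside that character's mask region.
        """
        B, N, C = latent.shape
        out = torch.zeros_like(latent)
        for k in range(char_feats.shape[1]):              # one cross-attention per character
            kv = char_feats[:, k]                          # (B, M, C)
            attn = torch.softmax(latent @ kv.transpose(1, 2) / C ** 0.5, dim=-1)  # (B, N, M)
            out = out + char_masks[:, k].unsqueeze(-1) * (attn @ kv)
        return latent + out                                # residual update

    # Toy shapes: 2 images, 64 latent tokens, 2 characters with 4 identity tokens each.
    x = masked_cross_attention(torch.randn(2, 64, 32),
                               torch.randn(2, 2, 4, 32),
                               torch.randint(0, 2, (2, 2, 64)).float())
    print(x.shape)  # torch.Size([2, 64, 32])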

arXiv:2412.07380  [pdf, other]  cs.CL, cs.AI
SpecFuse: Ensembling Large Language Models via Next-Segment Prediction
Authors: Bo Lv, Chen Tang, Yanan Zhang, Xin Liu, Yue Yu, Ping Luo
Abstract: Ensembles of generative large language models (LLMs) can integrate the strengths of different LLMs to compensate for the limitations of individual models. However, recent work has focused on training an additional fusion model to combine complete responses from multiple LLMs, failing to tap into their collaborative potential to generate higher-quality responses. Moreover, as the additional fusion model is trained on a specialized dataset, these methods struggle to generalize to open-domain queries from online users. In this paper, we propose SpecFuse, a novel ensemble framework that outputs the fused result by iteratively producing the next segment through collaboration among LLMs. This is achieved through cyclic execution of its inference and verification components. In each round, the inference component invokes each base LLM to generate candidate segments in parallel, and the verification component calls these LLMs again to predict the ranking of the segments. The top-ranked segment is then broadcast to all LLMs, encouraging them to generate higher-quality segments in the next round. This approach also allows the base LLMs to be plug-and-play, without any training or adaptation, avoiding generalization limitations. Furthermore, to conserve computational resources, we propose a model exit mechanism that dynamically excludes models exhibiting poor performance in previous rounds during each query response. In this way, it effectively reduces the number of model calls while maintaining overall performance.
Submitted 19 February, 2025; v1 submitted 10 December, 2024; originally announced December 2024.
Comments: 15 pages, 5 figures
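
The abstract outlines a per-round inference/verification cycle; the sketch below mirrors that loop with stand-in callables so the control flow is concrete. The model dictionary shape, the generate/rank interfaces, the rank-averaging rule, and the score-based exit heuristic are all illustrative assumptions, not SpecFuse's actual API.

    from typing import Dict, List

    def specfuse_style_ensemble(models: List[Dict], prompt: str,
                                max_rounds: int = 8, exit_after_bad_rounds: int = 2) -> str:
        """Round-based ensembling in the spirit of the abstract above: every round, all
        active models propose a candidate next segment (inference), all models rank the
        candidates (verification), and the top-ranked segment is broadcast to everyone."""
        context, active = prompt, list(models)
        bad_rounds = {m["name"]: 0 for m in models}
        for _ in range(max_rounds):
            if not active:
                break
            # Inference step: each active model proposes a candidate next segment.
            candidates = [(m["name"], m["generate"](context)) for m in active]
            # Verification step: each model returns candidate indices, best first;
            # a lower summed rank position means a better candidate.
            scores = {name: 0.0 for name, _ in candidates}
            for m in active:
                for pos, idx in enumerate(m["rank"](context, [seg for _, seg in candidates])):
                    scores[candidates[idx][0]] += pos
            best_name, best_seg = min(candidates, key=lambda c: scores[c[0]])
            if not best_seg:                      # an empty segment signals completion here
                break
            context += best_seg                   # broadcast the winning segment
            # Model-exit sketch (assumption): repeatedly worst-ranked models are dropped.
            worst_name, _ = max(candidates, key=lambda c: scores[c[0]])
            bad_rounds[worst_name] += 1
            active = [m for m in active if bad_rounds[m["name"]] < exit_after_bad_rounds]
        return context

    # Toy usage with dummy "models" that echo their own name as the next segment.
    echo = lambda name: {"name": name,
                         "generate": lambda ctx: f" [{name}]",
                         "rank": lambda ctx, segs: list(range(len(segs)))}
    print(specfuse_style_ensemble([echo("A"), echo("B")], "Q:", max_rounds=3))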

arXiv:2412.05808  [pdf, other]  cs.CV, cs.MM
SizeGS: Size-aware Compression of 3D Gaussians with Hierarchical Mixed Precision Quantization
Authors: Shuzhao Xie, Jiahang Liu, Weixiang Zhang, Shijia Ge, Sicheng Pan, Chen Tang, Yunpeng Bai, Zhi Wang
Abstract: Effective compression technology is crucial for 3DGS to adapt to varying storage and transmission conditions. However, existing methods fail to address size constraints while maintaining optimal quality. In this paper, we introduce SizeGS, a framework that compresses 3DGS within a specified size budget while optimizing visual quality. We start with a size estimator to establish a clear relationship between file size and hyperparameters. Leveraging this estimator, we incorporate mixed precision quantization (MPQ) into 3DGS attributes, structuring MPQ at two hierarchical levels -- inter-attribute and intra-attribute -- to optimize visual quality under the size constraint. At the inter-attribute level, we assign bit-widths to each attribute channel by formulating the combinatorial optimization as a 0-1 integer linear program, which can be solved efficiently. At the intra-attribute level, we divide each attribute channel into blocks of vectors, quantizing each vector based on the optimal bit-width derived at the inter-attribute level. Dynamic programming determines block lengths. Using the size estimator and MPQ, we develop a calibrated algorithm to identify optimal hyperparameters in just 10 minutes, achieving a 1.69x efficiency increase with quality comparable to state-of-the-art methods.
Submitted 7 December, 2024; originally announced December 2024.
Comments: Automatically compressing 3DGS into the desired file size while maximizing the visual quality
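
As a toy illustration of the inter-attribute step described above (one bit-width per attribute channel, minimal distortion under a total size budget), the snippet below solves a tiny instance by exhaustive search. The paper formulates this as a 0-1 integer linear program; the channel names, element counts, and distortion numbers here are made up for the example.

    from itertools import product

    def allocate_bitwidths(channels, budget_bits):
        """Choose one bit-width per attribute channel so the total quantized size stays
        within budget_bits while the summed distortion proxy is minimal.
        Brute force stands in for the 0-1 ILP of the paper (illustration only)."""
        names = list(channels)
        options = [sorted(channels[n]["distortion"]) for n in names]   # candidate bit-widths
        best = None
        for choice in product(*options):
            size = sum(b * channels[n]["count"] for n, b in zip(names, choice))
            if size > budget_bits:
                continue
            dist = sum(channels[n]["distortion"][b] for n, b in zip(names, choice))
            if best is None or dist < best[0]:
                best = (dist, dict(zip(names, choice)), size)
        return best  # (total_distortion, {channel: bits}, total_bits) or None

    # Hypothetical per-channel distortion estimates from a calibration pass.
    channels = {
        "position": {"count": 10000, "distortion": {8: 0.01, 6: 0.05, 4: 0.30}},
        "opacity":  {"count": 10000, "distortion": {8: 0.02, 4: 0.04, 2: 0.15}},
        "sh_color": {"count": 30000, "distortion": {6: 0.03, 4: 0.08, 2: 0.40}},
    }
    print(allocate_bitwidths(channels, budget_bits=300_000))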

arXiv:2412.05551  [pdf, other]  cs.CV
GAQAT: Gradient-Adaptive Quantization-Aware Training for Domain Generalization
Authors: Jiacheng Jiang, Yuan Meng, Chen Tang, Han Yu, Qun Li, Zhi Wang, Wenwu Zhu
Abstract: Research on loss surface geometry, such as Sharpness-Aware Minimization (SAM), shows that flatter minima improve generalization. Recent studies further reveal that flatter minima can also reduce the domain generalization (DG) gap. However, existing flatness-based DG techniques predominantly operate within a full-precision training process, which is impractical for deployment on resource-constrained edge devices that typically rely on lower bit-width representations (e.g., 4 bits, 3 bits). Consequently, low-precision quantization-aware training is critical for optimizing these techniques in real-world applications. In this paper, we observe a significant degradation in performance when applying state-of-the-art DG-SAM methods to quantized models, suggesting that current approaches fail to preserve generalizability during the low-precision training process. To address this limitation, we propose a novel Gradient-Adaptive Quantization-Aware Training (GAQAT) framework for DG.
Our approach begins by identifying the scale-gradient conflict problem in low-precision quantization, where the task loss and smoothness loss induce conflicting gradients for the scaling factors of quantizers, with certain layers exhibiting opposing gradient directions. This conflict renders the optimization of quantized weights highly unstable. To mitigate this, we further introduce a mechanism to quantify gradient inconsistencies and selectively freeze the gradients of scaling factors, thereby stabilizing the training process and enhancing out-of-domain generalization. Extensive experiments validate the effectiveness of the proposed GAQAT framework. On PACS, our 3-bit and 4-bit models outperform direct DG-QAT integration by up to 4.5%. On DomainNet, the 4-bit model achieves near-lossless performance compared to full precision, with improvements of 1.39% (4-bit) and 1.06% (3-bit) over the SOTA QAT baseline.
Submitted 7 December, 2024; originally announced December 2024.
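
To make the freezing idea above concrete, here is a generic sketch: measure how strongly the task-loss and smoothness-loss gradients on each layer's quantizer scale factors disagree, and zero the update for layers where they point in opposing directions. The cosine test, the threshold, and the dictionary layout are illustrative assumptions rather than the paper's exact criterion.

    import numpy as np

    def freeze_conflicting_scales(task_grads, smooth_grads, cos_threshold=0.0):
        """task_grads / smooth_grads: {layer_name: gradient array w.r.t. that layer's
        quantizer scale factors}. Returns (combined_grads, frozen_layers): layers whose
        two gradients point in conflicting directions get a zero update this step."""
        combined, frozen = {}, []
        for name, g_task in task_grads.items():
            g_smooth = smooth_grads[name]
            denom = np.linalg.norm(g_task) * np.linalg.norm(g_smooth) + 1e-12
            cos = float(np.dot(g_task.ravel(), g_smooth.ravel()) / denom)
            if cos < cos_threshold:            # opposing directions -> freeze this layer's scales
                combined[name] = np.zeros_like(g_task)
                frozen.append(name)
            else:
                combined[name] = g_task + g_smooth
        return combined, frozen

    # Tiny illustration with made-up gradients for two layers.
    task = {"conv1": np.array([0.2, -0.1]), "conv2": np.array([0.05, 0.07])}
    smooth = {"conv1": np.array([-0.3, 0.2]), "conv2": np.array([0.01, 0.02])}
    print(freeze_conflicting_scales(task, smooth))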

arXiv:2412.05460  [pdf, other]  cs.CV
CigTime: Corrective Instruction Generation Through Inverse Motion Editing
Authors: Qihang Fang, Chengcheng Tang, Bugra Tekin, Yanchao Yang
Abstract: Recent advancements in models linking natural language with human motions have shown significant promise in motion generation and editing based on instructional text. Motivated by applications in sports coaching and motor skill learning, we investigate the inverse problem: generating corrective instructional text, leveraging motion editing and generation models. We introduce a novel approach that, given a user's current motion (source) and the desired motion (target), generates text instructions to guide the user towards achieving the target motion. We leverage large language models to generate corrective texts and utilize existing motion generation and editing frameworks to compile datasets of triplets (source motion, target motion, and corrective text). Using this data, we propose a new motion-language model for generating corrective instructions. We present both qualitative and quantitative results across a diverse range of applications that largely improve upon baselines. Our approach demonstrates its effectiveness in instructional scenarios, offering text-based guidance to correct and enhance user performance.
Submitted 6 December, 2024; originally announced December 2024.
Comments: 20 pages, 8 figures, NeurIPS 2024
MSC Class: 68T45; ACM Class: I.4.9

arXiv:2412.04448  [pdf, other]  cs.CV
MEMO: Memory-Guided Diffusion for Expressive Talking Video Generation
Authors: Longtao Zheng, Yifan Zhang, Hanzhong Guo, Jiachun Pan, Zhenxiong Tan, Jiahao Lu, Chuanxin Tang, Bo An, Shuicheng Yan
Abstract: Recent advances in video diffusion models have unlocked new potential for realistic audio-driven talking video generation. However, achieving seamless audio-lip synchronization, maintaining long-term identity consistency, and producing natural, audio-aligned expressions in generated talking videos remain significant challenges. To address these challenges, we propose Memory-guided EMOtion-aware diffusion (MEMO), an end-to-end audio-driven portrait animation approach to generate identity-consistent and expressive talking videos. Our approach is built around two key modules: (1) a memory-guided temporal module, which enhances long-term identity consistency and motion smoothness by developing memory states to store information from a longer past context to guide temporal modeling via linear attention; and (2) an emotion-aware audio module, which replaces traditional cross attention with multi-modal attention to enhance audio-video interaction, while detecting emotions from audio to refine facial expressions via emotion adaptive layer norm. Extensive quantitative and qualitative results demonstrate that MEMO generates more realistic talking videos across diverse image and audio types, outperforming state-of-the-art methods in overall quality, audio-lip synchronization, identity consistency, and expression-emotion alignment.
Submitted 5 December, 2024; originally announced December 2024.
Comments: Project Page: https://memoavatar.github.io
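
The "emotion adaptive layer norm" mentioned above resembles the adaptive LayerNorm pattern used widely in diffusion models; the sketch below shows that generic pattern (scale and shift predicted from an emotion embedding). Dimensions, how the emotion vector is produced, and where the module sits in the network are assumptions, not MEMO's implementation.

    import torch
    import torch.nn as nn

    class EmotionAdaptiveLayerNorm(nn.Module):
        """LayerNorm whose affine parameters are predicted from an emotion embedding,
        so a detected emotion can modulate the feature statistics of a frame."""
        def __init__(self, feat_dim: int, emotion_dim: int):
            super().__init__()
            self.norm = nn.LayerNorm(feat_dim, elementwise_affine=False)
            self.to_scale_shift = nn.Linear(emotion_dim, 2 * feat_dim)

        def forward(self, x: torch.Tensor, emotion: torch.Tensor) -> torch.Tensor:
            # x: (B, N, feat_dim) frame tokens; emotion: (B, emotion_dim)
            scale, shift = self.to_scale_shift(emotion).chunk(2, dim=-1)
            return self.norm(x) * (1 + scale.unsqueeze(1)) + shift.unsqueeze(1)

    # Usage with made-up shapes.
    ada_ln = EmotionAdaptiveLayerNorm(feat_dim=64, emotion_dim=8)
    out = ada_ln(torch.randn(2, 16, 64), torch.randn(2, 8))
    print(out.shape)  # torch.Size([2, 16, 64])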

arXiv:2411.19000  [pdf]  cs.HC, cs.AI, eess.SY
An AI-driven multimodal smart home platform for continuous monitoring and intelligent assistance in post-stroke patients
Authors: Chenyu Tang, Ruizhi Zhang, Shuo Gao, Zihe Zhao, Zibo Zhang, Jiaqi Wang, Cong Li, Junliang Chen, Yanning Dai, Shengbo Wang, Ruoyu Juan, Qiaoying Li, Ruimou Xie, Xuhang Chen, Xinkai Zhou, Yunjia Xia, Jianan Chen, Fanghao Lu, Xin Li, Ninglli Wang, Peter Smielewski, Yu Pan, Hubin Zhao, Luigi G. Occhipinti
Abstract: At-home rehabilitation for post-stroke patients presents significant challenges, as continuous, personalized care is often limited outside clinical settings. Additionally, the absence of comprehensive solutions addressing diverse monitoring and assistance needs in home environments complicates recovery efforts.
Here, we present a multimodal smart home platform designed for continuous, at-home rehabilitation of post-stroke patients, integrating wearable sensing, ambient monitoring, and adaptive automation. A plantar pressure insole equipped with a machine learning pipeline classifies users into motor recovery stages with up to 94% accuracy, enabling quantitative tracking of walking patterns. A head-mounted eye-tracking module supports cognitive assessments and hands-free control of household devices, while ambient sensors ensure sub-second response times for interaction. These data streams are fused locally via a hierarchical Internet of Things (IoT) architecture, protecting privacy and minimizing latency. An embedded large language model (LLM) agent, Auto-Care, continuously interprets multimodal data to provide real-time interventions: issuing personalized reminders, adjusting environmental conditions, and notifying caregivers. Implemented in a post-stroke context, this integrated smart home platform increases overall user satisfaction by an average of 115% (p<0.01) compared to a traditional home environment. Beyond stroke, the system offers a scalable framework for patient-centered, long-term care in broader neurorehabilitation and aging-in-place applications.
Submitted 15 March, 2025; v1 submitted 28 November, 2024; originally announced November 2024.
Comments: 5 figures, 41 references

arXiv:2411.18612  [pdf, other]  cs.LG, cs.AI, cs.RO, stat.ML
Robust Offline Reinforcement Learning with Linearly Structured f-Divergence Regularization
Authors: Cheng Tang, Zhishuai Liu, Pan Xu
Abstract: The Distributionally Robust Markov Decision Process (DRMDP) is a popular framework for addressing dynamics shift in reinforcement learning by learning policies robust to the worst-case transition dynamics within a constrained set. However, solving its dual optimization oracle poses significant challenges, limiting theoretical analysis and computational efficiency. The recently proposed Robust Regularized Markov Decision Process (RRMDP) replaces the uncertainty set constraint with a regularization term on the value function, offering improved scalability and theoretical insights. Yet, existing RRMDP methods rely on unstructured regularization, often leading to overly conservative policies by considering transitions that are unrealistic. To address these issues, we propose a novel framework, the d-rectangular linear robust regularized Markov decision process (d-RRMDP), which introduces a linear latent structure into both transition kernels and regularization.
For the offline RL setting, where an agent learns robust policies from a pre-collected dataset in the nominal environment, we develop a family of algorithms, Robust Regularized Pessimistic Value Iteration (R2PVI), employing linear function approximation and f-divergence based regularization terms on transition kernels. We provide instance-dependent upper bounds on the suboptimality gap of R2PVI policies, showing these bounds depend on how well the dataset covers state-action spaces visited by the optimal robust policy under robustly admissible transitions. This term is further shown to be fundamental to d-RRMDPs via information-theoretic lower bounds. Finally, numerical experiments validate that R2PVI learns robust policies and is computationally more efficient than methods for constrained DRMDPs.
Submitted 27 November, 2024; originally announced November 2024.
Comments: 52 pages, 3 figures, 2 tables

arXiv:2411.18266  [pdf]  eess.AS, cs.AI, cs.SD, eess.SY
Wearable intelligent throat enables natural speech in stroke patients with dysarthria
Authors: Chenyu Tang, Shuo Gao, Cong Li, Wentian Yi, Yuxuan Jin, Xiaoxue Zhai, Sixuan Lei, Hongbei Meng, Zibo Zhang, Muzi Xu, Shengbo Wang, Xuhang Chen, Chenxi Wang, Hongyun Yang, Ningli Wang, Wenyu Wang, Jin Cao, Xiaodong Feng, Peter Smielewski, Yu Pan, Wenhui Song, Martin Birchall, Luigi G. Occhipinti
Abstract: Wearable silent speech systems hold significant potential for restoring communication in patients with speech impairments. However, seamless, coherent speech remains elusive, and clinical efficacy is still unproven. Here, we present an AI-driven intelligent throat (IT) system that integrates throat muscle vibrations and carotid pulse signal sensors with large language model (LLM) processing to enable fluent, emotionally expressive communication. The system utilizes ultrasensitive textile strain sensors to capture high-quality signals from the neck area and supports token-level processing for real-time, continuous speech decoding, enabling seamless, delay-free communication. In tests with five stroke patients with dysarthria, IT's LLM agents intelligently corrected token errors and enriched sentence-level emotional and logical coherence, achieving low error rates (4.2% word error rate, 2.9% sentence error rate) and a 55% increase in user satisfaction. This work establishes a portable, intuitive communication platform for patients with dysarthria, with the potential to be applied broadly across different neurological conditions and in multi-language support systems.
Submitted 14 March, 2025; v1 submitted 27 November, 2024; originally announced November 2024.
Comments: 5 figures, 45 references

arXiv:2411.16786  [pdf, other]  cs.DC
Staleness-Centric Optimizations for Efficient Diffusion MoE Inference
Authors: Jiajun Luo, Lizhuo Luo, Jianru Xu, Jiajun Song, Rongwei Lu, Chen Tang, Zhi Wang
Abstract: Mixture-of-experts-based (MoE-based) diffusion models have shown their scalability and ability to generate high-quality images, making them a promising choice for efficient model scaling. However, they rely on expert parallelism across GPUs, necessitating efficient parallelism optimization. While state-of-the-art diffusion parallel inference methods overlap communication and computation via displaced operations, they introduce substantial staleness -- the utilization of outdated activations, which is especially severe in expert parallelism scenarios and leads to significant performance degradation. We identify this staleness issue and propose DICE, a staleness-centric optimization with a three-fold approach: (1) Interweaved Parallelism reduces step-level staleness for free while overlapping communication and computation; (2) Selective Synchronization operates at the layer level and protects critical layers that are vulnerable to stale activations; and (3) Conditional Communication, a token-level, training-free method that dynamically adjusts communication frequency based on token importance.
Together, these optimizations effectively reduce staleness, achieving up to 1.2x speedup with minimal quality degradation. Our results establish DICE as an effective, scalable solution for large-scale MoE-based diffusion model inference.
Submitted 25 November, 2024; originally announced November 2024.

arXiv:2411.16273  [pdf, other]  eess.SY, cs.LG
Deep Learning for Motion Classification in Ankle Exoskeletons Using Surface EMG and IMU Signals
Authors: Silas Ruhrberg Estévez, Josée Mallah, Dominika Kazieczko, Chenyu Tang, Luigi G. Occhipinti
Abstract: Ankle exoskeletons have garnered considerable interest for their potential to enhance mobility and reduce fall risks, particularly among the aging population. The efficacy of these devices relies on accurate real-time prediction of the user's intended movements through sensor-based inputs. This paper presents a novel motion prediction framework that integrates three Inertial Measurement Units (IMUs) and eight surface Electromyography (sEMG) sensors to capture both kinematic and muscular activity data. A comprehensive set of activities, representative of everyday movements in barrier-free environments, was recorded for the purpose.
Our findings reveal that Convolutional Neural Networks (CNNs) slightly outperform Long Short-Term Memory (LSTM) networks on a dataset of five motion tasks, achieving classification accuracies of 96.5 ± 0.8% and 87.5 ± 2.9%, respectively. Furthermore, we demonstrate the system's proficiency in transfer learning, enabling accurate motion classification for new subjects using just ten samples per class for finetuning. The robustness of the model is demonstrated by its resilience to sensor failures resulting in absent signals, maintaining reliable performance in real-world scenarios. These results underscore the potential of deep learning algorithms to enhance the functionality and safety of ankle exoskeletons, ultimately improving their usability in daily life.
Submitted 25 November, 2024; originally announced November 2024.
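
For readers who want a concrete starting point, below is a generic 1D-CNN classifier over fixed-length windows of concatenated IMU and sEMG channels, in the spirit of the CNN branch compared above. The channel count, window length, and architecture are assumptions for illustration and do not reproduce the paper's network.

    import torch
    import torch.nn as nn

    # Assumed input: windows of shape (batch, channels, time), with
    # 3 IMUs x 6 axes = 18 channels plus 8 sEMG channels = 26 channels total.
    class MotionCNN(nn.Module):
        def __init__(self, in_channels: int = 26, num_classes: int = 5):
            super().__init__()
            self.features = nn.Sequential(
                nn.Conv1d(in_channels, 64, kernel_size=7, padding=3),
                nn.BatchNorm1d(64), nn.ReLU(),
                nn.MaxPool1d(2),
                nn.Conv1d(64, 128, kernel_size=5, padding=2),
                nn.BatchNorm1d(128), nn.ReLU(),
                nn.AdaptiveAvgPool1d(1),          # pool over the time axis
            )
            self.classifier = nn.Linear(128, num_classes)

        def forward(self, x: torch.Tensor) -> torch.Tensor:
            return self.classifier(self.features(x).squeeze(-1))

    model = MotionCNN()
    logits = model(torch.randn(4, 26, 200))       # 4 windows of 200 samples each
    print(logits.shape)                           # torch.Size([4, 5])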

arXiv:2411.15871  [pdf, other]  cs.DC
Hiding Communication Cost in Distributed LLM Training via Micro-batch Co-execution
Authors: Haiquan Wang, Chaoyi Ruan, Jia He, Jiaqi Ruan, Chengjie Tang, Xiaosong Ma, Cheng Li
Abstract: The growth of Large Language Models (LLMs) has necessitated large-scale distributed training. Highly optimized frameworks, however, still suffer significant losses in Model FLOPS utilization (often below 50%) due to large communication volumes. Meanwhile, our comprehensive profiling shows that the computation- and communication-intensive operators overlap well. This paper introduces DHelix, a novel micro-structure that dramatically improves the efficiency of LLM training, inspired by the DNA structure. Central to DHelix's design is Strand Interleaving (SI), which views the continuous stream of training micro-batches through a GPU as two strands. DHelix juxtaposes the forward and backward passes of the two strands and performs a systematic optimization for an SI plan that co-schedules the operators from the opposite strands, enabled by operator-level overlap profiling results and a dynamic-programming based search algorithm. Meanwhile, DHelix enables the two strands to share model states and space for activation data, effectively accommodating two micro-batches with under 3% extra memory space. DHelix seamlessly integrates with all forms of existing data/model parallelism, the most challenging being pipeline parallelism, thanks to its unique model folding design that results in a W-shaped pipeline. We evaluate DHelix training with the popular Llama and GPT dense models, plus the Phi Mixture of Experts (MoE) model, across 3 GPU clusters (A40, A800, and H100). Results show that it achieves 12-40% (up to 58% MFU) and 2-29% (up to 71% MFU) improvement on the 64-A40 and 64-A800 clusters, respectively, significantly outperforming state-of-the-art methods. On the H100 cluster, though the faster network reduces DHelix's profit margin, it makes cross-node tensor parallelism promising, a practice currently prohibitive due to communication costs.
Submitted 24 November, 2024; originally announced November 2024.

arXiv:2411.15127  [pdf, other]  cs.LG
PRIMUS: Pretraining IMU Encoders with Multimodal Self-Supervision
Authors: Arnav M. Das, Chi Ian Tang, Fahim Kawsar, Mohammad Malekzadeh
Abstract: Sensing human motions through Inertial Measurement Units (IMUs) embedded in personal devices has enabled significant applications in health and wellness. Labeled IMU data is scarce; however, unlabeled or weakly labeled IMU data can be used to model human motions. For video or text modalities, the "pretrain and adapt" approach utilizes large volumes of unlabeled or weakly labeled data to build a strong feature extractor, followed by adaptation to specific tasks using limited labeled data. However, pretraining methods are poorly understood for IMU data, and pipelines are rarely evaluated on out-of-domain tasks. We propose PRIMUS: a method for PRetraining IMU encoderS that uses a novel pretraining objective that is empirically validated based on downstream performance on both in-domain and out-of-domain datasets. The PRIMUS objective effectively enhances downstream performance by combining self-supervision, multimodal, and nearest-neighbor supervision. With fewer than 500 labeled samples per class, PRIMUS improves test accuracy by up to 15%, compared to state-of-the-art baselines. To benefit the broader community, we have open-sourced our code at github.com/nokia-bell-labs/pretrained-imu-encoders.
Submitted 21 January, 2025; v1 submitted 22 November, 2024; originally announced November 2024.
Comments: Presented at ICASSP 2025.
Also presented under the title "PRIMUS: Pretraining IMU Encoders with Multimodal and Self-Supervised Learning" at the NeurIPS 2024 TSALM Workshop (Time Series in the Age of Large Models)
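
The PRIMUS objective is described above as a combination of self-supervision, multimodal supervision, and nearest-neighbor supervision; the authors' actual implementation is open-sourced at the repository linked in the abstract. Purely as a schematic of "combining three supervision signals", the sketch below sums three InfoNCE-style terms with scalar weights; the loss choices, weights, and embedding sources are assumptions, not PRIMUS's recipe.

    import torch
    import torch.nn.functional as F

    def info_nce(a: torch.Tensor, b: torch.Tensor, temperature: float = 0.1) -> torch.Tensor:
        """Contrastive loss: the i-th row of `a` should match the i-th row of `b`."""
        logits = F.normalize(a, dim=-1) @ F.normalize(b, dim=-1).T / temperature
        targets = torch.arange(a.shape[0], device=a.device)
        return F.cross_entropy(logits, targets)

    def combined_pretraining_loss(z_imu, z_imu_aug, z_video, z_nn,
                                  w_ssl=1.0, w_mm=1.0, w_nn=1.0):
        """Weighted sum of three terms, mirroring the combination named in the abstract:
        z_imu     - embeddings of IMU windows
        z_imu_aug - embeddings of augmented views of the same windows (self-supervision)
        z_video   - embeddings of a paired modality such as video (multimodal supervision)
        z_nn      - embeddings of nearest-neighbor windows (nearest-neighbor supervision)
        """
        return (w_ssl * info_nce(z_imu, z_imu_aug)
                + w_mm * info_nce(z_imu, z_video)
                + w_nn * info_nce(z_imu, z_nn))

    # Toy usage with random embeddings.
    B, D = 8, 32
    loss = combined_pretraining_loss(*[torch.randn(B, D) for _ in range(4)])
    print(float(loss))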
