Search | arXiv e-print repository

<a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1&ndash;50 of 1,473 results for author: <span class="mathjax">Chen, B</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>&nbsp;&nbsp;</span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&amp;query=Chen%2C+B">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." 
type="text" value="Chen, B"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Chen%2C+B&amp;terms-0-field=author&amp;size=50&amp;order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Chen, B"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. 
</div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&amp;query=Chen%2C+B&amp;start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&amp;query=Chen%2C+B&amp;start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Chen%2C+B&amp;start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Chen%2C+B&amp;start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Chen%2C+B&amp;start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Chen%2C+B&amp;start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">&hellip;</span></li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.14747">arXiv:2502.14747</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.14747">pdf</a>, <a href="https://arxiv.org/format/2502.14747">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1145/3706598.3714148">10.1145/3706598.3714148 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> AIdeation: Designing a Human-AI Collaborative Ideation System for Concept Designers </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wang%2C+W">Wen-Fan Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Lu%2C+C">Chien-Ting Lu</a>, <a href="/search/cs?searchtype=author&amp;query=Campany%C3%A0%2C+N+P">Nil Ponsa Campany脿</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+B">Bing-Yu Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+M+Y">Mike Y. Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.14747v1-abstract-short" style="display: inline;"> Concept designers in the entertainment industry create highly detailed, often imaginary environments for movies, games, and TV shows. 
Their early ideation phase requires intensive research, brainstorming, visual exploration, and combination of various design elements to form cohesive designs. However, existing AI tools focus on image generation from user specifications, lacking support for the uni&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.14747v1-abstract-full').style.display = 'inline'; document.getElementById('2502.14747v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.14747v1-abstract-full" style="display: none;"> Concept designers in the entertainment industry create highly detailed, often imaginary environments for movies, games, and TV shows. Their early ideation phase requires intensive research, brainstorming, visual exploration, and combination of various design elements to form cohesive designs. However, existing AI tools focus on image generation from user specifications, lacking support for the unique needs and complexity of concept designers&#39; workflows. Through a formative study with 12 professional designers, we captured their workflows and identified key requirements for AI-assisted ideation tools. Leveraging these insights, we developed AIdeation to support early ideation by brainstorming design concepts with flexible searching and recombination of reference images. A user study with 16 professional designers showed that AIdeation significantly enhanced creativity, ideation efficiency, and satisfaction (all p&lt;.01) compared to current tools and workflows. A field study with 4 studios for 1 week provided insights into AIdeation&#39;s benefits and limitations in real-world projects. After the completion of the field study, two studios, covering films, television, and games, have continued to use AIdeation in their commercial projects to date, further validating AIdeation&#39;s improvement in ideation quality and efficiency. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.14747v1-abstract-full').style.display = 'none'; document.getElementById('2502.14747v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted ACM CHI Conference on Human Factors in Computing Systems (CHI &#39;25)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.13572">arXiv:2502.13572</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.13572">pdf</a>, <a href="https://arxiv.org/format/2502.13572">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> </div> </div> <p class="title is-5 mathjax"> Improving the Sparse Structure Learning of Spiking Neural Networks from the View of Compression Efficiency </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Shen%2C+J">Jiangrong Shen</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+Q">Qi Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Pan%2C+G">Gang Pan</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+B">Badong Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.13572v1-abstract-short" style="display: inline;"> The human brain utilizes spikes for information transmission and dynamically reorganizes its network structure to boost energy efficiency and cognitive capabilities throughout its lifespan. Drawing inspiration from this spike-based computation, Spiking Neural Networks (SNNs) have been developed to construct event-driven models that emulate this efficiency. Despite these advances, deep SNNs continu&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.13572v1-abstract-full').style.display = 'inline'; document.getElementById('2502.13572v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.13572v1-abstract-full" style="display: none;"> The human brain utilizes spikes for information transmission and dynamically reorganizes its network structure to boost energy efficiency and cognitive capabilities throughout its lifespan. Drawing inspiration from this spike-based computation, Spiking Neural Networks (SNNs) have been developed to construct event-driven models that emulate this efficiency. Despite these advances, deep SNNs continue to suffer from over-parameterization during training and inference, a stark contrast to the brain&#39;s ability to self-organize. Furthermore, existing sparse SNNs are challenged by maintaining optimal pruning levels due to a static pruning ratio, resulting in either under- or over-pruning. In this paper, we propose a novel two-stage dynamic structure learning approach for deep SNNs, aimed at maintaining effective sparse training from scratch while optimizing compression efficiency. The first stage evaluates the compressibility of existing sparse subnetworks within SNNs using the PQ index, which facilitates an adaptive determination of the rewiring ratio for synaptic connections based on data compression insights. In the second stage, this rewiring ratio critically informs the dynamic synaptic connection rewiring process, including both pruning and regrowth. 

3. arXiv:2502.13481 [pdf, other]  cs.IR
   Title: LLM4Tag: Automatic Tagging System for Information Retrieval via Large Language Models
   Authors: Ruiming Tang, Chenxu Zhu, Bo Chen, Weipeng Zhang, Menghui Zhu, Xinyi Dai, Huifeng Guo
   Abstract: Tagging systems play an essential role in various information retrieval applications such as search engines and recommender systems. Recently, Large Language Models (LLMs) have been applied in tagging systems due to their extensive world knowledge, semantic understanding, and reasoning capabilities. Despite achieving remarkable performance, existing methods still have limitations, including difficulties in retrieving relevant candidate tags comprehensively, challenges in adapting to emerging domain-specific knowledge, and the lack of reliable tag confidence quantification. To address these three limitations, we propose LLM4Tag, an automatic tagging system. First, a graph-based tag recall module is designed to effectively and comprehensively construct a small-scale, highly relevant candidate tag set. Subsequently, a knowledge-enhanced tag generation module is employed to generate accurate tags with long-term and short-term knowledge injection. Finally, a tag confidence calibration module is introduced to generate reliable tag confidence scores. Extensive experiments over three large-scale industrial datasets show that LLM4Tag significantly outperforms state-of-the-art baselines, and LLM4Tag has been deployed online for content tagging, serving hundreds of millions of users.
   Submitted 19 February, 2025; originally announced February 2025.
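The LLM4Tag abstract describes a three-module pipeline (graph-based tag recall, knowledge-enhanced tag generation, confidence calibration) without giving interfaces. The sketch below only mirrors that flow at the level described; every function name and the thresholding step are hypothetical, not the system's actual API.

```python
from typing import Callable

def tag_content(
    content: str,
    recall_candidate_tags: Callable[[str], list],      # stage 1: graph-based recall (assumed)
    generate_tags_with_llm: Callable[[str, list], list],  # stage 2: knowledge-enhanced generation (assumed)
    calibrate_confidence: Callable[[str, str], float],  # stage 3: confidence calibration (assumed)
    min_confidence: float = 0.5,
) -> list:
    """Hypothetical three-stage tagging flow mirroring the abstract: recall a
    small candidate set, generate tags from it, then keep only tags whose
    calibrated confidence clears a threshold."""
    candidates = recall_candidate_tags(content)               # small, relevant candidate set
    generated = generate_tags_with_llm(content, candidates)   # LLM tag generation
    scored = [(tag, calibrate_confidence(content, tag)) for tag in generated]
    return [(tag, score) for tag, score in scored if score >= min_confidence]


if __name__ == "__main__":
    # Toy stand-ins so the sketch runs end to end.
    demo = tag_content(
        "A video about training spiking neural networks on neuromorphic chips",
        recall_candidate_tags=lambda text: ["AI", "neuroscience", "cooking"],
        generate_tags_with_llm=lambda text, cands: [t for t in cands if t != "cooking"],
        calibrate_confidence=lambda text, tag: 0.9 if tag == "AI" else 0.6,
    )
    print(demo)  # [('AI', 0.9), ('neuroscience', 0.6)]
```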

4. arXiv:2502.13163 [pdf, other]  cs.OS, cs.CR, cs.SE
   Title: A Survey of Fuzzing Open-Source Operating Systems
   Authors: Kun Hu, Qicai Chen, Zilong Lu, Wenzhuo Zhang, Bihuan Chen, You Lu, Haowen Jiang, Bingkun Sun, Xin Peng, Wenyun Zhao
   Abstract: Vulnerabilities in open-source operating systems (OSs) pose substantial security risks to software systems, making their detection crucial. While fuzzing has been an effective vulnerability detection technique in various domains, OS fuzzing (OSF) faces unique challenges due to OS complexity and multi-layered interaction, and has not been comprehensively reviewed. Therefore, this work systematically surveys the state-of-the-art OSF techniques, categorizes them based on the general fuzzing process, and investigates challenges specific to kernel, file system, driver, and hypervisor fuzzing. Finally, future research directions for OSF are discussed. GitHub: https://github.com/pghk13/Survey-OSF
   Submitted 20 February, 2025; v1 submitted 16 February, 2025; originally announced February 2025.
   Comments: 45 pages

5. arXiv:2502.12860 [pdf, other]  cs.CV
   Title: An Experimental Study of SOTA LiDAR Segmentation Models
   Authors: Bike Chen, Antti Tikanmäki, Juha Röning
   Abstract: Point cloud segmentation (PCS) aims to classify each point in a point cloud. The task enables robots to parse their 3D surroundings and run autonomously. According to the underlying point cloud representation, existing PCS models can be roughly divided into point-, voxel-, and range image-based models. However, no existing work reports comprehensive comparisons among the state-of-the-art point-, voxel-, and range image-based models from an application perspective, which makes it difficult to choose among these models for real-world scenarios. In this paper, we provide thorough comparisons among the models by considering LiDAR data motion compensation and the metrics of model parameters, maximum GPU memory allocated during testing, inference latency, frames per second, and intersection-over-union (IoU) and mean IoU (mIoU) scores. The experimental results help engineers choose a reasonable PCS model for an application and may inspire researchers in the PCS field to design more practical models for real-world scenarios.
   Submitted 18 February, 2025; originally announced February 2025.

6. arXiv:2502.12574 [pdf, other]  cs.LG, cs.AI
   Title: HeadInfer: Memory-Efficient LLM Inference by Head-wise Offloading
   Authors: Cheng Luo, Zefan Cai, Hanshi Sun, Jinqi Xiao, Bo Yuan, Wen Xiao, Junjie Hu, Jiawei Zhao, Beidi Chen, Anima Anandkumar
   Abstract: Transformer-based large language models (LLMs) demonstrate impressive performance in long context generation. Extending the context length has disproportionately shifted the memory footprint of LLMs during inference to the key-value cache (KV cache). In this paper, we propose HEADINFER, which offloads the KV cache to CPU RAM while avoiding the need to fully store the KV cache for any transformer layer on the GPU. HEADINFER employs a fine-grained, head-wise offloading strategy, maintaining only selected attention heads' KV cache on the GPU while computing attention output dynamically. Through roofline analysis, we demonstrate that HEADINFER maintains computational efficiency while significantly reducing the memory footprint. We evaluate HEADINFER on the Llama-3-8B model with a 1-million-token sequence, reducing the GPU memory footprint of the KV cache from 128 GB to 1 GB and the total GPU memory usage from 207 GB to 17 GB, achieving a 92% reduction compared to BF16 baseline inference. Notably, HEADINFER enables 4-million-token inference with an 8B model on a single consumer GPU with 24 GB of memory (e.g., an NVIDIA RTX 4090) without approximation methods.
   Submitted 18 February, 2025; originally announced February 2025.
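The HeadInfer abstract describes head-wise KV-cache offloading only at a high level. The toy sketch below conveys the core idea, keeping a single head's keys and values on the GPU at a time while the rest stay in CPU RAM; the shapes, function names, and the serial loop are simplifying assumptions, and a real system would overlap transfers with computation (e.g., asynchronous prefetching) rather than iterate serially.

```python
import torch
import torch.nn.functional as F

def headwise_offloaded_attention(q, k_cpu, v_cpu, device="cuda"):
    """Toy illustration of head-wise KV offloading.

    q:      (num_heads, q_len, head_dim) on the GPU
    k_cpu:  list of per-head K tensors, each (kv_len, head_dim), kept in CPU RAM
    v_cpu:  list of per-head V tensors, same shapes, kept in CPU RAM

    Only one head's K/V lives on the GPU at any moment; a real system would
    prefetch the next head asynchronously to hide transfer latency.
    """
    outputs = []
    for h in range(q.shape[0]):
        k = k_cpu[h].to(device, non_blocking=True)   # bring this head's keys over
        v = v_cpu[h].to(device, non_blocking=True)   # ...and its values
        attn = F.scaled_dot_product_attention(
            q[h].unsqueeze(0), k.unsqueeze(0), v.unsqueeze(0)
        )
        outputs.append(attn.squeeze(0))
        del k, v                                      # free GPU memory before the next head
    return torch.stack(outputs)                       # (num_heads, q_len, head_dim)

if __name__ == "__main__":
    dev = "cuda" if torch.cuda.is_available() else "cpu"
    H, L, D = 8, 4, 64
    q = torch.randn(H, L, D, device=dev)
    k_cpu = [torch.randn(1024, D) for _ in range(H)]
    v_cpu = [torch.randn(1024, D) for _ in range(H)]
    print(headwise_offloaded_attention(q, k_cpu, v_cpu, device=dev).shape)
```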

7. arXiv:2502.11824 [pdf, other]  cs.CL
   Title: M-ABSA: A Multilingual Dataset for Aspect-Based Sentiment Analysis
   Authors: Chengyan Wu, Bolei Ma, Yihong Liu, Zheyu Zhang, Ningyuan Deng, Yanshu Li, Baolan Chen, Yi Zhang, Barbara Plank, Yun Xue
   Abstract: Aspect-based sentiment analysis (ABSA) is a crucial task in information extraction and sentiment analysis, aiming to identify aspects with associated sentiment elements in text. However, existing ABSA datasets are predominantly English-centric, limiting the scope for multilingual evaluation and research. To bridge this gap, we present M-ABSA, a comprehensive dataset spanning 7 domains and 21 languages, making it the most extensive multilingual parallel dataset for ABSA to date. Our primary focus is on triplet extraction, which involves identifying aspect terms, aspect categories, and sentiment polarities. The dataset is constructed through an automatic translation process with human review to ensure quality. We perform extensive experiments using various baselines to assess performance and compatibility on M-ABSA. Our empirical findings highlight that the dataset enables diverse evaluation tasks, such as multilingual and multi-domain transfer learning and large language model evaluation, underscoring its inclusivity and its potential to drive advancements in multilingual ABSA research.
   Submitted 17 February, 2025; originally announced February 2025.

8. arXiv:2502.11271 [pdf, other]  cs.LG, cs.CL, cs.CV, cs.MA
   Title: OctoTools: An Agentic Framework with Extensible Tools for Complex Reasoning
   Authors: Pan Lu, Bowen Chen, Sheng Liu, Rahul Thapa, Joseph Boen, James Zou
   Abstract: Solving complex reasoning tasks may involve visual understanding, domain knowledge retrieval, numerical calculation, and multi-step reasoning. Existing methods augment large language models (LLMs) with external tools but are restricted to specialized domains or limited tool types, or require additional training data. In this paper, we introduce OctoTools, a training-free, user-friendly, and easily extensible open-source agentic framework designed to tackle complex reasoning across diverse domains. OctoTools introduces standardized tool cards to encapsulate tool functionality, a planner for both high-level and low-level planning, and an executor to carry out tool usage. We validate OctoTools' generality across 16 diverse tasks (including MathVista, MMLU-Pro, MedQA, and GAIA-Text), achieving substantial average accuracy gains of 9.3% over GPT-4o. Furthermore, OctoTools outperforms AutoGen, GPT-Functions, and LangChain by up to 10.6% when given the same set of tools. Through comprehensive analysis and ablations, OctoTools demonstrates advantages in task planning, effective tool usage, and multi-step problem solving.
   Submitted 16 February, 2025; originally announced February 2025.
   Comments: 89 pages, 18 figures. Project website: https://octotools.github.io/
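The OctoTools abstract names three components (standardized tool cards, a planner, an executor) but not their interfaces. The sketch below is a minimal, hypothetical planner/executor loop written only to make those roles concrete; the `ToolCard` fields and the `plan` callable are invented for illustration and do not reflect OctoTools' actual API (see the project website above for the real framework).

```python
from __future__ import annotations
from dataclasses import dataclass
from typing import Callable

@dataclass
class ToolCard:
    """A standardized wrapper describing one tool: what it does and how to call it."""
    name: str
    description: str
    run: Callable[[str], str]

def solve(task: str, tools: dict[str, ToolCard],
          plan: Callable[[str, list[str], dict[str, ToolCard]], tuple[str, str] | None],
          max_steps: int = 5) -> list[str]:
    """Hypothetical planner/executor loop: the planner picks (tool, tool_input)
    from the task and the trajectory so far; the executor runs the tool card."""
    trajectory: list[str] = []
    for _ in range(max_steps):
        step = plan(task, trajectory, tools)       # high-level decision: which tool next?
        if step is None:                           # planner decides the task is done
            break
        tool_name, tool_input = step
        result = tools[tool_name].run(tool_input)  # executor carries out the tool call
        trajectory.append(f"{tool_name}({tool_input}) -> {result}")
    return trajectory

if __name__ == "__main__":
    tools = {"calculator": ToolCard("calculator", "evaluate arithmetic", run=lambda s: str(eval(s)))}
    # Trivial planner: call the calculator once, then stop.
    planner = lambda task, traj, t: ("calculator", "9.3 * 2") if not traj else None
    print(solve("double the reported accuracy gain", tools, planner))
```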

9. arXiv:2502.10891 [pdf, other]  cs.NI
   Title: AquaScope: Reliable Underwater Image Transmission on Mobile Devices
   Authors: Beitong Tian, Lingzhi Zhao, Bo Chen, Mingyuan Wu, Haozhen Zheng, Deepak Vasisht, Francis Y. Yan, Klara Nahrstedt
   Abstract: Underwater communication is essential for both recreational and scientific activities, such as scuba diving. However, existing methods remain highly constrained by environmental challenges and often require specialized hardware, driving research into more accessible underwater communication solutions. While recent acoustic-based communication systems support text messaging on mobile devices, their low data rates severely limit broader applications. We present AquaScope, the first acoustic communication system capable of underwater image transmission on commodity mobile devices. To address the key challenges of underwater environments (limited bandwidth and high transmission errors), AquaScope employs and enhances generative image compression to improve compression efficiency, and integrates it with reliability-enhancement techniques at the physical layer to strengthen error resilience. We implemented AquaScope on the Android platform and demonstrated its feasibility for underwater image transmission. Experimental results show that AquaScope enables reliable, low-latency image transmission while preserving perceptual image quality, across various bandwidth-constrained and error-prone underwater conditions.
   Submitted 15 February, 2025; originally announced February 2025.
   Comments: 15 pages, 26 figures

10. arXiv:2502.09654 [pdf, other]  eess.IV, cs.CV
   Title: Heterogeneous Mixture of Experts for Remote Sensing Image Super-Resolution
   Authors: Bowen Chen, Keyan Chen, Mohan Yang, Zhengxia Zou, Zhenwei Shi
   Abstract: Remote sensing image super-resolution (SR) aims to reconstruct high-resolution remote sensing images from low-resolution inputs, thereby addressing limitations imposed by sensors and imaging conditions. However, the inherent characteristics of remote sensing images, including diverse ground object types and complex details, pose significant challenges to achieving high-quality reconstruction. Existing methods typically employ a uniform structure to process various types of ground objects without distinction, making it difficult to adapt to the complex characteristics of remote sensing images. To address this issue, we introduce a Mixture of Experts (MoE) model and design a set of heterogeneous experts. These experts are organized into multiple expert groups, where experts within each group are homogeneous while being heterogeneous across groups. This design ensures that specialized activation parameters can be employed to handle the diverse and intricate details of ground objects effectively. To better accommodate the heterogeneous experts, we propose a multi-level feature aggregation strategy to guide the routing process. Additionally, we develop a dual-routing mechanism to adaptively select the optimal expert for each pixel. Experiments conducted on the UCMerced and AID datasets demonstrate that our proposed method achieves superior SR reconstruction accuracy compared to state-of-the-art methods. The code will be available at https://github.com/Mr-Bamboo/MFG-HMoE
   Submitted 11 February, 2025; originally announced February 2025.
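The super-resolution abstract above routes each pixel through grouped heterogeneous experts via a "dual-routing mechanism", which is not spelled out here. One plausible reading, sketched below purely as an illustration, is a two-level gate that first selects an expert group per pixel and then an expert within that group; the module shapes and hard top-1 choices are assumptions, not the MFG-HMoE design.

```python
import torch
import torch.nn as nn

class TwoLevelRouter(nn.Module):
    """Illustrative per-pixel two-level routing: group gate, then expert gate.

    features: (batch, channels, height, width). Returns integer group and
    expert indices for every pixel.
    """
    def __init__(self, channels: int, num_groups: int, experts_per_group: int):
        super().__init__()
        self.group_gate = nn.Conv2d(channels, num_groups, kernel_size=1)
        self.expert_gate = nn.Conv2d(channels, num_groups * experts_per_group, kernel_size=1)
        self.num_groups = num_groups
        self.experts_per_group = experts_per_group

    def forward(self, features: torch.Tensor):
        b, _, h, w = features.shape
        group_prob = self.group_gate(features).softmax(dim=1)              # (b, G, h, w)
        expert_logits = self.expert_gate(features).view(
            b, self.num_groups, self.experts_per_group, h, w)
        expert_prob = expert_logits.softmax(dim=2)                         # within-group softmax
        group_idx = group_prob.argmax(dim=1)                               # hard top-1 group per pixel
        # gather the chosen group's within-group distribution, then take its top-1 expert
        gather_idx = group_idx.unsqueeze(1).unsqueeze(2).expand(-1, 1, self.experts_per_group, -1, -1)
        chosen = expert_prob.gather(1, gather_idx).squeeze(1)              # (b, E, h, w)
        expert_idx = chosen.argmax(dim=1)
        return group_idx, expert_idx

if __name__ == "__main__":
    router = TwoLevelRouter(channels=32, num_groups=3, experts_per_group=4)
    g, e = router(torch.randn(2, 32, 16, 16))
    print(g.shape, e.shape)  # torch.Size([2, 16, 16]) twice
```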

11. arXiv:2502.08150 [pdf, other]  cs.LG, cs.AI, cs.CV
   Title: Force Matching with Relativistic Constraints: A Physics-Inspired Approach to Stable and Efficient Generative Modeling
   Authors: Yang Cao, Bo Chen, Xiaoyu Li, Yingyu Liang, Zhizhou Sha, Zhenmei Shi, Zhao Song, Mingda Wan
   Abstract: This paper introduces Force Matching (ForM), a novel framework for generative modeling that represents an initial exploration into leveraging special relativistic mechanics to enhance the stability of the sampling process. By incorporating the Lorentz factor, ForM imposes a velocity constraint, ensuring that sample velocities remain bounded within a constant limit. This constraint serves as a fundamental mechanism for stabilizing the generative dynamics, leading to a more robust and controlled sampling process. We provide a rigorous theoretical analysis demonstrating that the velocity constraint is preserved throughout the sampling procedure within the ForM framework. To validate the effectiveness of our approach, we conduct extensive empirical evaluations. On the half-moons dataset, ForM significantly outperforms baseline methods, achieving the lowest Euclidean distance loss of 0.714, in contrast to vanilla first-order flow matching (5.853) and first- and second-order flow matching (5.793). Additionally, we perform an ablation study to further investigate the impact of our velocity constraint, reaffirming the superiority of ForM in stabilizing the generative process. The theoretical guarantees and empirical results underscore the potential of integrating special relativity principles into generative modeling. Our findings suggest that ForM provides a promising pathway toward achieving stable, efficient, and flexible generative processes. This work lays the foundation for future advancements in high-dimensional generative modeling, opening new avenues for the application of physical principles in machine learning.
   Submitted 12 February, 2025; originally announced February 2025.
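The central mechanism in the ForM abstract is a Lorentz-factor velocity bound that keeps sample speeds below a constant limit. The snippet below shows one standard way to realize such a bound, rescaling a raw velocity by 1/sqrt(1 + |v|^2/c^2) so the resulting speed is strictly below c for any finite input; this is an illustrative interpretation, not necessarily the exact formulation used in the paper.

```python
import torch

def lorentz_bound_velocity(v: torch.Tensor, c: float = 1.0) -> torch.Tensor:
    """Rescale raw velocities so every sample's speed stays strictly below c.

    v: (batch, dim) raw velocity values (e.g., from a learned network).
    Returns v / sqrt(1 + |v|^2 / c^2), whose norm is < c for any finite v.
    """
    speed_sq = (v ** 2).sum(dim=-1, keepdim=True)
    return v / torch.sqrt(1.0 + speed_sq / c**2)

def euler_sample(x0: torch.Tensor, velocity_net, steps: int = 100, c: float = 1.0):
    """Toy Euler integration of dx/dt = bounded velocity over t in [0, 1]."""
    x, dt = x0.clone(), 1.0 / steps
    for i in range(steps):
        t = torch.full((x.shape[0], 1), i * dt)
        x = x + dt * lorentz_bound_velocity(velocity_net(x, t), c=c)
    return x

if __name__ == "__main__":
    net = lambda x, t: 50.0 * x             # deliberately huge raw velocities
    x = euler_sample(torch.randn(4, 2), net)
    print(x.norm(dim=-1))                   # stays finite: each Euler step moves at most c*dt
```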

12. arXiv:2502.07575 [pdf]  eess.AS, cs.CL
   Title: Towards Efficient and Multifaceted Computer-assisted Pronunciation Training Leveraging Hierarchical Selective State Space Model and Decoupled Cross-entropy Loss
   Authors: Fu-An Chao, Berlin Chen
   Abstract: Prior efforts in building computer-assisted pronunciation training (CAPT) systems often treat automatic pronunciation assessment (APA) and mispronunciation detection and diagnosis (MDD) as separate fronts: the former aims to provide multiple pronunciation aspect scores across diverse linguistic levels, while the latter focuses instead on pinpointing the precise phonetic pronunciation errors made by non-native language learners. However, it is generally expected that a full-fledged CAPT system should perform both functionalities simultaneously and efficiently. In response to this surging demand, in this work we first propose HMamba, a novel CAPT approach that seamlessly integrates APA and MDD tasks in parallel. In addition, we introduce a novel loss function, decoupled cross-entropy loss (deXent), specifically tailored for MDD to facilitate better-supervised learning for detecting mispronounced phones, thereby enhancing overall performance. A comprehensive set of empirical results on the speechocean762 benchmark dataset demonstrates the effectiveness of our approach on APA. Notably, our proposed approach also yields a considerable improvement in MDD performance over a strong baseline, achieving an F1-score of 63.85%. Our code is available at https://github.com/Fuann/hmamba
   Submitted 11 February, 2025; originally announced February 2025.
   Comments: Accepted to NAACL 2025 Main Conference

13. arXiv:2502.06860 [pdf, other]  cs.CV, cs.GR
   Title: AutoSketch: VLM-assisted Style-Aware Vector Sketch Completion
   Authors: Hsiao-Yuan Chin, I-Chao Shen, Yi-Ting Chiu, Bing-Yu Chen
   Abstract: The ability to automatically complete a partial sketch that depicts a complex scene, e.g., "a woman chatting with a man in the park", is very useful. However, existing sketch generation methods create sketches from scratch; they do not complete a partial sketch in the style of the original. To address this challenge, we introduce AutoSketch, a style-aware vector sketch completion method that accommodates diverse sketch styles. Our key observation is that the style descriptions of a sketch in natural language preserve the style during automatic sketch completion. Thus, we use a pretrained vision-language model (VLM) to describe the styles of the partial sketches in natural language and replicate these styles using newly generated strokes. We initially optimize the strokes to match an input prompt augmented by style descriptions extracted from the VLM. Such descriptions allow the method to establish a diffusion prior in close alignment with that of the partial sketch. Next, we utilize the VLM to generate an executable style adjustment code that adjusts the strokes to conform to the desired style. We compare our method with existing methods across various sketch styles and prompts, perform extensive ablation studies and qualitative and quantitative evaluations, and demonstrate that AutoSketch can support various sketch scenarios.
   Submitted 13 February, 2025; v1 submitted 7 February, 2025; originally announced February 2025.
   Comments: 11 pages

14. arXiv:2502.06764 [pdf, other]  cs.LG, cs.CV
   Title: History-Guided Video Diffusion
   Authors: Kiwhan Song, Boyuan Chen, Max Simchowitz, Yilun Du, Russ Tedrake, Vincent Sitzmann
   Abstract: Classifier-free guidance (CFG) is a key technique for improving conditional generation in diffusion models, enabling more accurate control while enhancing sample quality. It is natural to extend this technique to video diffusion, which generates video conditioned on a variable number of context frames, collectively referred to as history.
However, we find two key challenges to guiding with variabl&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.06764v1-abstract-full').style.display = 'inline'; document.getElementById('2502.06764v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.06764v1-abstract-full" style="display: none;"> Classifier-free guidance (CFG) is a key technique for improving conditional generation in diffusion models, enabling more accurate control while enhancing sample quality. It is natural to extend this technique to video diffusion, which generates video conditioned on a variable number of context frames, collectively referred to as history. However, we find two key challenges to guiding with variable-length history: architectures that only support fixed-size conditioning, and the empirical observation that CFG-style history dropout performs poorly. To address this, we propose the Diffusion Forcing Transformer (DFoT), a video diffusion architecture and theoretically grounded training objective that jointly enable conditioning on a flexible number of history frames. We then introduce History Guidance, a family of guidance methods uniquely enabled by DFoT. We show that its simplest form, vanilla history guidance, already significantly improves video generation quality and temporal consistency. A more advanced method, history guidance across time and frequency further enhances motion dynamics, enables compositional generalization to out-of-distribution history, and can stably roll out extremely long videos. Website: https://boyuan.space/history-guidance <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.06764v1-abstract-full').style.display = 'none'; document.getElementById('2502.06764v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. 
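<p class="is-size-7">The vanilla form of history guidance reads as classifier-free guidance applied to the history condition. A hedged sketch, assuming a <code>denoiser(x_t, t, history)</code> callable where <code>history=None</code> stands for the history-dropped branch and <code>w</code> is the guidance weight; the DFoT architecture itself is not reproduced here.</p>
<pre><code>import torch

def history_guided_denoise(denoiser, x_t, t, history, w=2.0):
    """CFG-style history guidance (vanilla form): extrapolate from the
    history-free prediction toward the history-conditioned one.

    denoiser(x_t, t, history) returns the predicted noise/velocity; passing
    history=None drops the conditioning.
    """
    eps_cond = denoiser(x_t, t, history)      # conditioned on context frames
    eps_uncond = denoiser(x_t, t, None)       # history dropped
    return eps_uncond + w * (eps_cond - eps_uncond)
</code></pre>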
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project Website: https://boyuan.space/history-guidance</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.05431">arXiv:2502.05431</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.05431">pdf</a>, <a href="https://arxiv.org/format/2502.05431">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> APE: Faster and Longer Context-Augmented Generation via Adaptive Parallel Encoding </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Yang%2C+X">Xinyu Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+T">Tianqi Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+B">Beidi Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.05431v2-abstract-short" style="display: inline;"> Context-augmented generation (CAG) techniques, including RAG and ICL, require the efficient combination of multiple contexts to generate responses to user queries. Directly inputting these contexts as a sequence introduces a considerable computational burden by re-encoding the combined selection of contexts for every request. To address this, we explore the promising potential of parallel encoding&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.05431v2-abstract-full').style.display = 'inline'; document.getElementById('2502.05431v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.05431v2-abstract-full" style="display: none;"> Context-augmented generation (CAG) techniques, including RAG and ICL, require the efficient combination of multiple contexts to generate responses to user queries. Directly inputting these contexts as a sequence introduces a considerable computational burden by re-encoding the combined selection of contexts for every request. To address this, we explore the promising potential of parallel encoding to independently pre-compute and cache each context&#39;s KV states. This approach enables the direct loading of cached states during inference while accommodating more contexts through position reuse across contexts. However, due to misalignments in attention distribution, directly applying parallel encoding results in a significant performance drop. To enable effective and efficient CAG, we propose Adaptive Parallel Encoding ($\textbf{APE}$), which brings shared prefix, attention temperature, and scaling factor to align the distribution of parallel encoding with sequential encoding. Results on RAG and ICL tasks demonstrate that APE can preserve 98% and 93% sequential encoding performance using the same inputs while outperforming parallel encoding by 3.6% and 7.9%, respectively. It also scales to many-shot CAG, effectively encoding hundreds of contexts in parallel. 
Efficiency evaluation shows that APE can achieve an end-to-end 4.5$\times$ speedup by reducing 28$\times$ prefilling time for a 128K-length context. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.05431v2-abstract-full').style.display = 'none'; document.getElementById('2502.05431v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 7 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ICLR 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.05252">arXiv:2502.05252</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.05252">pdf</a>, <a href="https://arxiv.org/format/2502.05252">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> GSM-Infinite: How Do Your LLMs Behave over Infinitely Increasing Context Length and Reasoning Complexity? </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+Y">Yang Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+H">Hongyi Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+Z">Zhuoming Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Tian%2C+Y">Yuandong Tian</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+B">Beidi Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.05252v1-abstract-short" style="display: inline;"> Long-context large language models (LLMs) have recently shown strong performance in information retrieval and long-document QA. However, to tackle the most challenging intellectual problems, LLMs must reason effectively in long and complex contexts (e.g., frontier mathematical research). Studying how LLMs handle increasing reasoning complexity and context length is essential, yet existing benchmar&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.05252v1-abstract-full').style.display = 'inline'; document.getElementById('2502.05252v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.05252v1-abstract-full" style="display: none;"> Long-context large language models (LLMs) have recently shown strong performance in information retrieval and long-document QA. However, to tackle the most challenging intellectual problems, LLMs must reason effectively in long and complex contexts (e.g., frontier mathematical research). Studying how LLMs handle increasing reasoning complexity and context length is essential, yet existing benchmarks lack a solid basis for quantitative evaluation. 
Inspired by the abstraction of GSM-8K problems as computational graphs, and the ability to introduce noise by adding unnecessary nodes and edges, we develop a grade school math problem generator capable of producing arithmetic problems with infinite difficulty and context length under fine-grained control. Using our newly synthesized GSM-Infinite benchmark, we comprehensively evaluate existing LLMs. We find a consistent sigmoid decline in reasoning performance as complexity increases, along with a systematic inference scaling trend: exponentially increasing inference computation yields only linear performance gains. These findings underscore the fundamental limitations of current long-context LLMs and the key challenges in scaling reasoning capabilities. Our GSM-Infinite benchmark provides a scalable and controllable testbed for systematically studying and advancing LLM reasoning in long and complex contexts. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.05252v1-abstract-full').style.display = 'none'; document.getElementById('2502.05252v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.04405">arXiv:2502.04405</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.04405">pdf</a>, <a href="https://arxiv.org/format/2502.04405">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> FAS: Fast ANN-SNN Conversion for Spiking Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Chen%2C+L">Long Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Song%2C+X">Xiaotian Song</a>, <a href="/search/cs?searchtype=author&amp;query=Song%2C+A">Andy Song</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+B">BaDong Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Lv%2C+J">Jiancheng Lv</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+Y">Yanan Sun</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.04405v1-abstract-short" style="display: inline;"> Spiking Large Language Models have been shown as a good alternative to LLMs in various scenarios. Existing methods for creating Spiking LLMs, i.e., direct training and ANN-SNN conversion, often suffer from performance degradation and relatively high computational costs. 
To address these issues, we propose a novel Fast ANN-SNN conversion strategy (FAS) that transforms LLMs into spiking LLMs in two&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.04405v1-abstract-full').style.display = 'inline'; document.getElementById('2502.04405v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.04405v1-abstract-full" style="display: none;"> Spiking Large Language Models have been shown as a good alternative to LLMs in various scenarios. Existing methods for creating Spiking LLMs, i.e., direct training and ANN-SNN conversion, often suffer from performance degradation and relatively high computational costs. To address these issues, we propose a novel Fast ANN-SNN conversion strategy (FAS) that transforms LLMs into spiking LLMs in two stages. The first stage employs a full-parameter fine-tuning of pre-trained models, so it does not need any direct training from scratch. The second stage introduces a coarse-to-fine calibration method to reduce conversion errors and improve accuracy. Our experiments on both language and vision-language tasks across four different scales of LLMs demonstrate that FAS can achieve state-of-the-art performance yet with significantly reduced inference latency and computational costs. For example, FAS only takes 8 timesteps to achieve an accuracy of 3% higher than that of the OPT-7B model, while reducing energy consumption by 96.63%. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.04405v1-abstract-full').style.display = 'none'; document.getElementById('2502.04405v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.02853">arXiv:2502.02853</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.02853">pdf</a>, <a href="https://arxiv.org/format/2502.02853">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Rethinking Latent Representations in Behavior Cloning: An Information Bottleneck Approach for Robot Manipulation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Bai%2C+S">Shuanghao Bai</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+W">Wanqi Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Ding%2C+P">Pengxiang Ding</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+W">Wei Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+D">Donglin Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+B">Badong Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.02853v2-abstract-short" style="display: inline;"> Behavior Cloning (BC) is a widely adopted visual imitation learning method in robot manipulation. 
Current BC approaches often enhance generalization by leveraging large datasets and incorporating additional visual and textual modalities to capture more diverse information. However, these methods overlook whether the learned representations contain redundant information and lack a solid theoretical&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.02853v2-abstract-full').style.display = 'inline'; document.getElementById('2502.02853v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.02853v2-abstract-full" style="display: none;"> Behavior Cloning (BC) is a widely adopted visual imitation learning method in robot manipulation. Current BC approaches often enhance generalization by leveraging large datasets and incorporating additional visual and textual modalities to capture more diverse information. However, these methods overlook whether the learned representations contain redundant information and lack a solid theoretical foundation to guide the learning process. To address these limitations, we adopt an information-theoretic perspective and introduce mutual information to quantify and mitigate redundancy in latent representations. Building on this, we incorporate the Information Bottleneck (IB) principle into BC, which extends the idea of reducing redundancy by providing a structured framework for compressing irrelevant information while preserving task-relevant features. This work presents the first comprehensive study on redundancy in latent representations across various methods, backbones, and experimental settings, while extending the generalizability of the IB to BC. Extensive experiments and analyses on the CortexBench and LIBERO benchmarks demonstrate significant performance improvements with IB, underscoring the importance of reducing input data redundancy and highlighting its practical value for more practical applications. Project Page: https://baishuanghao.github.io/BC-IB.github.io. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.02853v2-abstract-full').style.display = 'none'; document.getElementById('2502.02853v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 4 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. 
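<p class="is-size-7">One common way to instantiate the information-bottleneck idea described above is a behavior-cloning loss plus a variational compression penalty. A sketch under that assumption: the KL term below is a standard variational upper bound on I(X; Z), and <code>beta</code>, <code>policy</code>, and the Gaussian encoder parameterization are illustrative choices, not necessarily the paper's estimator.</p>
<pre><code>import torch
import torch.nn.functional as F

def bc_ib_loss(mu, logvar, z, policy, actions, beta=1e-3):
    """Behavior-cloning loss with a variational information-bottleneck penalty.

    mu, logvar : encoder outputs parameterizing q(z|x); z is a reparameterized
    sample. The KL term against a standard normal prior bounds I(X; Z); beta
    trades compression against imitation accuracy.
    """
    bc = F.mse_loss(policy(z), actions)   # imitation term
    kl = 0.5 * torch.mean(torch.sum(mu.pow(2) + logvar.exp() - logvar - 1.0, dim=-1))
    return bc + beta * kl
</code></pre>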
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">20 pages, 11 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.02789">arXiv:2502.02789</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.02789">pdf</a>, <a href="https://arxiv.org/format/2502.02789">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Speculative Prefill: Turbocharging TTFT with Lightweight and Training-Free Token Importance Estimation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Liu%2C+J">Jingyu Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+B">Beidi Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+C">Ce Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.02789v1-abstract-short" style="display: inline;"> Improving time-to-first-token (TTFT) is an essentially important objective in modern large language model (LLM) inference engines. Because optimizing TTFT directly results in higher maximal QPS and meets the requirements of many critical applications. However, boosting TTFT is notoriously challenging since it is purely compute-bounded and the performance bottleneck shifts from the self-attention t&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.02789v1-abstract-full').style.display = 'inline'; document.getElementById('2502.02789v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.02789v1-abstract-full" style="display: none;"> Improving time-to-first-token (TTFT) is an essentially important objective in modern large language model (LLM) inference engines. Because optimizing TTFT directly results in higher maximal QPS and meets the requirements of many critical applications. However, boosting TTFT is notoriously challenging since it is purely compute-bounded and the performance bottleneck shifts from the self-attention to the MLP part. We present SpecPrefill, a training free framework that accelerates the inference TTFT for both long and medium context queries based on the following insight: LLMs are generalized enough to still preserve the quality given only a carefully chosen subset of prompt tokens. At its core, SpecPrefill leverages a lightweight model to speculate locally important tokens based on the context. These tokens, along with the necessary positional information, are then sent to the main model for processing. We evaluate SpecPrefill with a diverse set of tasks, followed by a comprehensive benchmarking of performance improvement both in a real end-to-end setting and ablation studies. SpecPrefill manages to serve Llama-3.1-405B-Instruct-FP8 with up to $7\times$ maximal end-to-end QPS on real downstream tasks and $7.66\times$ TTFT improvement during benchmarking. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.02789v1-abstract-full').style.display = 'none'; document.getElementById('2502.02789v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.01578">arXiv:2502.01578</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.01578">pdf</a>, <a href="https://arxiv.org/format/2502.01578">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> ReGLA: Refining Gated Linear Attention </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Lu%2C+P">Peng Lu</a>, <a href="/search/cs?searchtype=author&amp;query=Kobyzev%2C+I">Ivan Kobyzev</a>, <a href="/search/cs?searchtype=author&amp;query=Rezagholizadeh%2C+M">Mehdi Rezagholizadeh</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+B">Boxing Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Langlais%2C+P">Philippe Langlais</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.01578v2-abstract-short" style="display: inline;"> Recent advancements in Large Language Models (LLMs) have set themselves apart with their exceptional performance in complex language modelling tasks. However, these models are also known for their significant computational and storage requirements, primarily due to the quadratic computation complexity of softmax attention. To mitigate this issue, linear attention has been designed to reduce the qu&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.01578v2-abstract-full').style.display = 'inline'; document.getElementById('2502.01578v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.01578v2-abstract-full" style="display: none;"> Recent advancements in Large Language Models (LLMs) have set themselves apart with their exceptional performance in complex language modelling tasks. However, these models are also known for their significant computational and storage requirements, primarily due to the quadratic computation complexity of softmax attention. To mitigate this issue, linear attention has been designed to reduce the quadratic space-time complexity that is inherent in standard transformers. In this work, we embarked on a comprehensive exploration of three key components that substantially impact the performance of the Gated Linear Attention module: feature maps, normalization, and the gating mechanism. We developed a feature mapping function to address some crucial issues that previous suggestions overlooked. Then we offered further rationale for the integration of normalization layers to stabilize the training process. Moreover, we explored the saturation phenomenon of the gating mechanism and augmented it with a refining module. 
We conducted extensive experiments and showed our architecture outperforms previous Gated Linear Attention mechanisms in extensive tasks including training from scratch and post-linearization with continual pre-training. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.01578v2-abstract-full').style.display = 'none'; document.getElementById('2502.01578v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 3 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by NAACL 2025 (main)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.00688">arXiv:2502.00688</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.00688">pdf</a>, <a href="https://arxiv.org/format/2502.00688">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> High-Order Matching for One-Step Shortcut Diffusion Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Chen%2C+B">Bo Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Gong%2C+C">Chengyue Gong</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+X">Xiaoyu Li</a>, <a href="/search/cs?searchtype=author&amp;query=Liang%2C+Y">Yingyu Liang</a>, <a href="/search/cs?searchtype=author&amp;query=Sha%2C+Z">Zhizhou Sha</a>, <a href="/search/cs?searchtype=author&amp;query=Shi%2C+Z">Zhenmei Shi</a>, <a href="/search/cs?searchtype=author&amp;query=Song%2C+Z">Zhao Song</a>, <a href="/search/cs?searchtype=author&amp;query=Wan%2C+M">Mingda Wan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.00688v1-abstract-short" style="display: inline;"> One-step shortcut diffusion models [Frans, Hafner, Levine and Abbeel, ICLR 2025] have shown potential in vision generation, but their reliance on first-order trajectory supervision is fundamentally limited. 
The Shortcut model&#39;s simplistic velocity-only approach fails to capture intrinsic manifold geometry, leading to erratic trajectories, poor geometric alignment, and instability-especially in hig&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.00688v1-abstract-full').style.display = 'inline'; document.getElementById('2502.00688v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.00688v1-abstract-full" style="display: none;"> One-step shortcut diffusion models [Frans, Hafner, Levine and Abbeel, ICLR 2025] have shown potential in vision generation, but their reliance on first-order trajectory supervision is fundamentally limited. The Shortcut model&#39;s simplistic velocity-only approach fails to capture intrinsic manifold geometry, leading to erratic trajectories, poor geometric alignment, and instability-especially in high-curvature regions. These shortcomings stem from its inability to model mid-horizon dependencies or complex distributional features, leaving it ill-equipped for robust generative modeling. In this work, we introduce HOMO (High-Order Matching for One-Step Shortcut Diffusion), a game-changing framework that leverages high-order supervision to revolutionize distribution transportation. By incorporating acceleration, jerk, and beyond, HOMO not only fixes the flaws of the Shortcut model but also achieves unprecedented smoothness, stability, and geometric precision. Theoretically, we prove that HOMO&#39;s high-order supervision ensures superior approximation accuracy, outperforming first-order methods. Empirically, HOMO dominates in complex settings, particularly in high-curvature regions where the Shortcut model struggles. Our experiments show that HOMO delivers smoother trajectories and better distributional alignment, setting a new standard for one-step generative models. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.00688v1-abstract-full').style.display = 'none'; document.getElementById('2502.00688v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. 
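<p class="is-size-7">High-order supervision as described above can be sketched as a weighted sum of matching losses over successive derivatives. The assumption that the model returns (velocity, acceleration, jerk) and that ground-truth derivatives of the reference trajectory are available is illustrative; this is a generic sketch, not HOMO's exact objective.</p>
<pre><code>import torch
import torch.nn.functional as F

def high_order_matching_loss(model, x_t, t, targets, weights=(1.0, 0.5, 0.25)):
    """Weighted multi-order matching loss: supervise velocity, acceleration and jerk.

    model(x_t, t) is assumed to return a tuple of predicted derivatives
    (velocity, acceleration, jerk); `targets` holds the corresponding
    ground-truth derivatives of the reference trajectory.
    """
    preds = model(x_t, t)
    return sum(w * F.mse_loss(p, y) for w, p, y in zip(weights, preds, targets))
</code></pre>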
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.18803">arXiv:2501.18803</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2501.18803">pdf</a>, <a href="https://arxiv.org/format/2501.18803">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Optimization and Control">math.OC</span> </div> </div> <p class="title is-5 mathjax"> Deceptive Sequential Decision-Making via Regularized Policy Optimization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Kim%2C+Y">Yerin Kim</a>, <a href="/search/cs?searchtype=author&amp;query=Benvenuti%2C+A">Alexander Benvenuti</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+B">Bo Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Karabag%2C+M">Mustafa Karabag</a>, <a href="/search/cs?searchtype=author&amp;query=Kulkarni%2C+A">Abhishek Kulkarni</a>, <a href="/search/cs?searchtype=author&amp;query=Bastian%2C+N+D">Nathaniel D. Bastian</a>, <a href="/search/cs?searchtype=author&amp;query=Topcu%2C+U">Ufuk Topcu</a>, <a href="/search/cs?searchtype=author&amp;query=Hale%2C+M">Matthew Hale</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.18803v1-abstract-short" style="display: inline;"> Autonomous systems are increasingly expected to operate in the presence of adversaries, though an adversary may infer sensitive information simply by observing a system, without even needing to interact with it. Therefore, in this work we present a deceptive decision-making framework that not only conceals sensitive information, but in fact actively misleads adversaries about it. We model autonomo&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.18803v1-abstract-full').style.display = 'inline'; document.getElementById('2501.18803v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.18803v1-abstract-full" style="display: none;"> Autonomous systems are increasingly expected to operate in the presence of adversaries, though an adversary may infer sensitive information simply by observing a system, without even needing to interact with it. Therefore, in this work we present a deceptive decision-making framework that not only conceals sensitive information, but in fact actively misleads adversaries about it. We model autonomous systems as Markov decision processes, and we consider adversaries that attempt to infer their reward functions using inverse reinforcement learning. To counter such efforts, we present two regularization strategies for policy synthesis problems that actively deceive an adversary about a system&#39;s underlying rewards. The first form of deception is ``diversionary&#39;&#39;, and it leads an adversary to draw any false conclusion about what the system&#39;s reward function is. The second form of deception is ``targeted&#39;&#39;, and it leads an adversary to draw a specific false conclusion about what the system&#39;s reward function is. 
We then show how each form of deception can be implemented in policy optimization problems, and we analytically bound the loss in total accumulated reward that is induced by deception. Next, we evaluate these developments in a multi-agent sequential decision-making problem with one real agent and multiple decoys. We show that diversionary deception can cause the adversary to believe that the most important agent is the least important, while attaining a total accumulated reward that is $98.83\%$ of its optimal, non-deceptive value. Similarly, we show that targeted deception can make any decoy appear to be the most important agent, while still attaining a total accumulated reward that is $99.25\%$ of its optimal, non-deceptive value. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.18803v1-abstract-full').style.display = 'none'; document.getElementById('2501.18803v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">21 pages, 5 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.18539">arXiv:2501.18539</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2501.18539">pdf</a>, <a href="https://arxiv.org/format/2501.18539">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> </div> </div> <p class="title is-5 mathjax"> Can we Retrieve Everything All at Once? ARM: An Alignment-Oriented LLM-based Retrieval Method </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Chen%2C+P+B">Peter Baile Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Y">Yi Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Cafarella%2C+M">Michael Cafarella</a>, <a href="/search/cs?searchtype=author&amp;query=Roth%2C+D">Dan Roth</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.18539v1-abstract-short" style="display: inline;"> Real-world open-domain questions can be complicated, particularly when answering them involves information from multiple information sources. LLMs have demonstrated impressive performance in decomposing complex tasks into simpler steps, and previous work has used it for better retrieval in support of complex questions. 
However, LLM&#39;s decomposition of questions is unaware of what data is available&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.18539v1-abstract-full').style.display = 'inline'; document.getElementById('2501.18539v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.18539v1-abstract-full" style="display: none;"> Real-world open-domain questions can be complicated, particularly when answering them involves information from multiple information sources. LLMs have demonstrated impressive performance in decomposing complex tasks into simpler steps, and previous work has used it for better retrieval in support of complex questions. However, LLM&#39;s decomposition of questions is unaware of what data is available and how data is organized, often leading to a sub-optimal retrieval performance. Recent effort in agentic RAG proposes to perform retrieval in an iterative fashion, where a followup query is derived as an action based on previous rounds of retrieval. While this provides one way of interacting with the data collection, agentic RAG&#39;s exploration of data is inefficient because successive queries depend on previous results rather than being guided by the organization of available data in the collection. To address this problem, we propose an LLM-based retrieval method -- ARM, that aims to better align the question with the organization of the data collection by exploring relationships among data objects beyond matching the utterance of the query, thus leading to a retrieve-all-at-once solution for complex queries. We evaluated ARM on two datasets, Bird and OTT-QA. On Bird, it outperforms standard RAG with query decomposition by up to 5.2 pt in execution accuracy and agentic RAG (ReAct) by up to 15.9 pt. On OTT-QA, it achieves up to 5.5 pt and 19.3 pt higher F1 match scores compared to these approaches. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.18539v1-abstract-full').style.display = 'none'; document.getElementById('2501.18539v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.18124">arXiv:2501.18124</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2501.18124">pdf</a>, <a href="https://arxiv.org/format/2501.18124">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> REMOTE: Real-time Ego-motion Tracking for Various Endoscopes via Multimodal Visual Feature Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Shao%2C+L">Liangjing Shao</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+B">Benshuang Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+S">Shuting Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+X">Xinrong Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.18124v2-abstract-short" style="display: inline;"> Real-time ego-motion tracking for endoscope is a significant task for efficient navigation and robotic automation of endoscopy. In this paper, a novel framework is proposed to perform real-time ego-motion tracking for endoscope. Firstly, a multi-modal visual feature learning network is proposed to perform relative pose prediction, in which the motion feature from the optical flow, the scene featur&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.18124v2-abstract-full').style.display = 'inline'; document.getElementById('2501.18124v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.18124v2-abstract-full" style="display: none;"> Real-time ego-motion tracking for endoscope is a significant task for efficient navigation and robotic automation of endoscopy. In this paper, a novel framework is proposed to perform real-time ego-motion tracking for endoscope. Firstly, a multi-modal visual feature learning network is proposed to perform relative pose prediction, in which the motion feature from the optical flow, the scene features and the joint feature from two adjacent observations are all extracted for prediction. Due to more correlation information in the channel dimension of the concatenated image, a novel feature extractor is designed based on an attention mechanism to integrate multi-dimensional information from the concatenation of two continuous frames. To extract more complete feature representation from the fused features, a novel pose decoder is proposed to predict the pose transformation from the concatenated feature map at the end of the framework. At last, the absolute pose of endoscope is calculated based on relative poses. The experiment is conducted on three datasets of various endoscopic scenes and the results demonstrate that the proposed method outperforms state-of-the-art methods. Besides, the inference speed of the proposed method is over 30 frames per second, which meets the real-time requirement. 
The project page is here: remote-bmxs.netlify.app <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.18124v2-abstract-full').style.display = 'none'; document.getElementById('2501.18124v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 29 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by ICRA 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.16256">arXiv:2501.16256</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2501.16256">pdf</a>, <a href="https://arxiv.org/format/2501.16256">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Databases">cs.DB</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Improving DBMS Scheduling Decisions with Fine-grained Performance Prediction on Concurrent Queries -- Extended </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wu%2C+Z">Ziniu Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Markakis%2C+M">Markos Markakis</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+C">Chunwei Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+P+B">Peter Baile Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Narayanaswamy%2C+B">Balakrishnan Narayanaswamy</a>, <a href="/search/cs?searchtype=author&amp;query=Kraska%2C+T">Tim Kraska</a>, <a href="/search/cs?searchtype=author&amp;query=Madden%2C+S">Samuel Madden</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.16256v2-abstract-short" style="display: inline;"> Query scheduling is a critical task that directly impacts query performance in database management systems (DBMS). Deeply integrated schedulers, which require changes to DBMS internals, are usually customized for a specific engine and can take months to implement. In contrast, non-intrusive schedulers make coarse-grained decisions, such as controlling query admission and re-ordering query executio&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.16256v2-abstract-full').style.display = 'inline'; document.getElementById('2501.16256v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.16256v2-abstract-full" style="display: none;"> Query scheduling is a critical task that directly impacts query performance in database management systems (DBMS). Deeply integrated schedulers, which require changes to DBMS internals, are usually customized for a specific engine and can take months to implement. 
In contrast, non-intrusive schedulers make coarse-grained decisions, such as controlling query admission and re-ordering query execution, without requiring modifications to DBMS internals. They require much less engineering effort and can be applied across a wide range of DBMS engines, offering immediate benefits to end users. However, most existing non-intrusive scheduling systems rely on simplified cost models and heuristics that cannot accurately model query interactions under concurrency and different system states, possibly leading to suboptimal scheduling decisions. This work introduces IconqSched, a new, principled non-intrusive scheduler that optimizes the execution order and timing of queries to enhance total end-to-end runtime as experienced by the user (query queuing time plus system runtime). Unlike previous approaches, IconqSched features a novel fine-grained predictor, Iconq, which treats the DBMS as a black box and accurately estimates the system runtime of concurrently executed queries under different system states. Using these predictions, IconqSched is able to capture system runtime variations across different query mixes and system loads. It then employs a greedy scheduling algorithm to effectively determine which queries to submit and when to submit them. We compare IconqSched to other schedulers in terms of end-to-end runtime using real workload traces. On Postgres, IconqSched reduces end-to-end runtime by 16.2%-28.2% on average and 33.6%-38.9% in the tail. Similarly, on Redshift, it reduces end-to-end runtime by 10.3%-14.1% on average and 14.9%-22.2% in the tail. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.16256v2-abstract-full').style.display = 'none'; document.getElementById('2501.16256v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 27 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025.
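<p class="is-size-7">The greedy scheduling idea can be sketched as: score each queued query by how much a runtime predictor expects it to slow the current mix, submit the least disruptive one, and defer otherwise. <code>predict_runtime</code> stands in for the Iconq predictor and <code>slowdown_limit</code> is an assumed knob; this is a simplified sketch, not the paper's algorithm.</p>
<pre><code>def greedy_schedule_step(queue, running, predict_runtime, slowdown_limit=1.2):
    """Decide which queued query (if any) to submit next.

    predict_runtime(mix) returns the predicted system runtime for executing that
    set of queries concurrently. The queued query that inflates the predicted
    runtime the least is submitted, unless even the best candidate would slow
    the current mix beyond `slowdown_limit`, in which case submission is deferred.
    """
    if not queue:
        return None
    if not running:
        # nothing is executing: start the query predicted to be fastest alone
        return min(queue, key=lambda q: predict_runtime([q]))
    baseline = predict_runtime(running)
    scored = [(predict_runtime(running + [q]) / baseline, q) for q in queue]
    best_factor, best_query = min(scored, key=lambda s: s[0])
    if best_factor > slowdown_limit:
        return None   # defer: every admission is predicted to be too disruptive
    return best_query
</code></pre>
<p class="is-size-7">Repeating this step whenever a query arrives or finishes yields a simple admission-and-ordering loop driven entirely by the predictor.</p>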
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.16103">arXiv:2501.16103</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2501.16103">pdf</a>, <a href="https://arxiv.org/ps/2501.16103">ps</a>, <a href="https://arxiv.org/format/2501.16103">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Static Batching of Irregular Workloads on GPUs: Framework and Application to Efficient MoE Model Inference </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Li%2C+Y">Yinghan Li</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Y">Yifei Li</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+J">Jiejing Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+B">Bujiao Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+X">Xiaotong Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Duan%2C+L">Lian Duan</a>, <a href="/search/cs?searchtype=author&amp;query=Jin%2C+Y">Yejun Jin</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Z">Zheng Li</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+X">Xuanyu Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+H">Haoyu Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+W">Wente Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Y">Yajie Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+J">Jiacheng Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+P">Peiyang Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Zheng%2C+L">Laiwen Zheng</a>, <a href="/search/cs?searchtype=author&amp;query=Yu%2C+W">Wenyuan Yu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.16103v1-abstract-short" style="display: inline;"> It has long been a problem to arrange and execute irregular workloads on massively parallel devices. We propose a general framework for statically batching irregular workloads into a single kernel with a runtime task mapping mechanism on GPUs. We further apply this framework to Mixture-of-Experts (MoE) model inference and implement an optimized and efficient CUDA kernel. Our MoE kernel achieves up&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.16103v1-abstract-full').style.display = 'inline'; document.getElementById('2501.16103v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.16103v1-abstract-full" style="display: none;"> It has long been a problem to arrange and execute irregular workloads on massively parallel devices. We propose a general framework for statically batching irregular workloads into a single kernel with a runtime task mapping mechanism on GPUs. We further apply this framework to Mixture-of-Experts (MoE) model inference and implement an optimized and efficient CUDA kernel. Our MoE kernel achieves up to 91% of the peak Tensor Core throughput on NVIDIA H800 GPU and 95% on NVIDIA H20 GPU. 
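<p class="is-size-7">The host-side half of static batching can be sketched as building a task table that maps fixed-size tiles of tokens to experts, so one fixed-grid kernel launch covers every expert's irregular workload. The tiling and NumPy bookkeeping below are illustrative assumptions; the paper's CUDA kernel is not reproduced here.</p>
<pre><code>import numpy as np

def build_task_map(expert_ids, tile_size=128):
    """Host-side sketch of static batching for one MoE layer.

    expert_ids : (num_tokens,) expert assignment per token. Tokens are grouped
    by expert and cut into fixed-size tiles; each returned task maps one tile
    (roughly, one block of a single batched kernel launch) to its expert and
    token indices, covering every expert's irregular workload with a fixed grid.
    """
    tasks = []
    for e in np.unique(expert_ids):
        token_idx = np.flatnonzero(expert_ids == e)
        for start in range(0, token_idx.size, tile_size):
            tasks.append((int(e), token_idx[start:start + tile_size]))
    return tasks
</code></pre>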
arXiv:2501.14577 [pdf, other] (cs.LG, cs.AI)
ZETA: Leveraging Z-order Curves for Efficient Top-k Attention
Authors: Qiuhao Zeng, Jerry Huang, Peng Lu, Gezheng Xu, Boxing Chen, Charles Ling, Boyu Wang
Abstract: Over recent years, the Transformer has become a fundamental building block for sequence modeling architectures. Yet at its core is the use of self-attention, whose memory and computational cost grow quadratically with the sequence length $N$, rendering it prohibitively expensive for long sequences. A promising approach is top-$k$ attention, which selects only the $k$ most relevant tokens and achieves performance comparable to vanilla self-attention while significantly reducing space and computational demands.
However, causal masks require the current query token to only attend to past tokens, preventing the existing top-$k$ attention method from efficiently searching for the most relevant tokens in parallel, thereby limiting training efficiency. In this work, we propose ZETA, leveraging Z-Order Curves for Efficient Top-$k$ Attention, to enable parallel querying of past tokens for entire sequences, in both space and time complexity of $\mathcal{O}(N \log N)$. We first theoretically show that the choice of key and query dimensions involves a trade-off between the curse of dimensionality and the preservation of relative distances after projection. In light of this insight, we propose reducing the dimensionality of keys and queries in contrast to values and further leverage Z-order curves to map low-dimensional keys and queries into one-dimensional space, which permits parallel sorting, thereby largely improving the efficiency for top-$k$ token selection. Experimental results demonstrate that ZETA matches the performance of standard attention on the synthetic Multi-Query Associative Recall task and outperforms attention and its variants on Long Range Arena and WikiText-103 language modeling.
Submitted 12 February, 2025; v1 submitted 24 January, 2025; originally announced January 2025.
Comments: 25 pages, 4 figures, accepted in International Conference on Learning Representations (ICLR) 2025
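A minimal sketch of the Z-order trick the abstract describes: quantize low-dimensional keys, interleave their bits into one code, and sort once so that approximate top-$k$ neighbors can be read off along the curve. The bit width, the 2-D setting, and the neighbor window below are assumptions, not the paper's configuration.

```python
# Illustrative sketch only: Morton (Z-order) codes turn top-k selection over
# low-dimensional keys into a 1-D sorting problem.
import numpy as np

BITS = 10  # bits per coordinate after quantization (assumed)

def z_order_codes(points):
    """points: (n, 2) array with coordinates in [0, 1); returns one interleaved
    integer code per point."""
    q = (points * (1 << BITS)).astype(np.uint64)
    codes = np.zeros(len(points), dtype=np.uint64)
    for b in range(BITS):  # interleave bit b of x and y
        codes |= ((q[:, 0] >> np.uint64(b)) & np.uint64(1)) << np.uint64(2 * b)
        codes |= ((q[:, 1] >> np.uint64(b)) & np.uint64(1)) << np.uint64(2 * b + 1)
    return codes

rng = np.random.default_rng(0)
keys = rng.random((1024, 2))
codes = z_order_codes(keys)
order = np.argsort(codes)                      # one parallel sort for the whole sequence
q_code = z_order_codes(rng.random((1, 2)))[0]  # code of a query's low-dimensional projection
pos = np.searchsorted(codes[order], q_code)
topk = order[max(0, pos - 4): pos + 4]         # approximate top-k: neighbors along the curve
print(topk)
```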
arXiv:2501.13344 [pdf, other] (cs.IR, cs.AI)
Full-Stack Optimized Large Language Models for Lifelong Sequential Behavior Comprehension in Recommendation
Authors: Rong Shan, Jiachen Zhu, Jianghao Lin, Chenxu Zhu, Bo Chen, Ruiming Tang, Yong Yu, Weinan Zhang
Abstract: In this paper, we address the lifelong sequential behavior incomprehension problem in large language models (LLMs) for recommendation, where LLMs struggle to extract useful information from long user behavior sequences, even within their context limits. To tackle this, we propose ReLLaX (Retrieval-enhanced Large Language models Plus), a framework offering optimization across data, prompt, and parameter levels. At the data level, we introduce Semantic User Behavior Retrieval (SUBR) to reduce sequence heterogeneity, making it easier for LLMs to extract key information. For prompt-level enhancement, we employ Soft Prompt Augmentation (SPA) to inject collaborative knowledge, aligning item representations with recommendation tasks and improving LLMs' exploration of item relationships.
Finally, at the parameter level, we propose Component Fully-interactive LoRA (CFLoRA), which enhances LoRA's expressiveness by enabling interactions between its components, allowing better capture of sequential information. Moreover, we present new perspectives to compare current LoRA-based LLM4Rec methods, i.e. from both a composite and a decomposed view. We theoretically demonstrate that the ways they employ LoRA for recommendation are degraded versions of our CFLoRA, with different constraints on atom component interactions. Extensive experiments on three public datasets demonstrate ReLLaX's superiority over existing baselines and its ability to mitigate lifelong sequential behavior incomprehension effectively.
Submitted 22 January, 2025; originally announced January 2025.
Comments: Under Review

arXiv:2501.13340 [pdf, other] (cs.CV)
Retrievals Can Be Detrimental: A Contrastive Backdoor Attack Paradigm on Retrieval-Augmented Diffusion Models
Authors: Hao Fang, Xiaohang Sui, Hongyao Yu, Jiawei Kong, Sijin Yu, Bin Chen, Hao Wu, Shu-Tao Xia
Abstract: Diffusion models (DMs) have recently demonstrated remarkable generation capability. However, their training generally requires huge computational resources and large-scale datasets. To solve these, recent studies empower DMs with the advanced Retrieval-Augmented Generation (RAG) technique and propose retrieval-augmented diffusion models (RDMs). By incorporating rich knowledge from an auxiliary database, RAG enhances diffusion models' generation and generalization ability while significantly reducing model parameters. Despite the great success, RAG may introduce novel security issues that warrant further investigation. In this paper, we reveal that the RDM is susceptible to backdoor attacks by proposing a multimodal contrastive attack approach named BadRDM. Our framework fully considers RAG's characteristics and is devised to manipulate the retrieved items for given text triggers, thereby further controlling the generated contents. Specifically, we first insert a tiny portion of images into the retrieval database as target toxicity surrogates. Subsequently, a malicious variant of contrastive learning is adopted to inject backdoors into the retriever, which builds shortcuts from triggers to the toxicity surrogates. Furthermore, we enhance the attacks through novel entropy-based selection and generative augmentation strategies that can derive better toxicity surrogates. Extensive experiments on two mainstream tasks demonstrate the proposed BadRDM achieves outstanding attack effects while preserving the model's benign utility.
Submitted 22 January, 2025; originally announced January 2025.
arXiv:2501.13312 [pdf, other] (cs.LG)
Tensor-Var: Variational Data Assimilation in Tensor Product Feature Space
Authors: Yiming Yang, Xiaoyuan Cheng, Daniel Giles, Sibo Cheng, Yi He, Xiao Xue, Boli Chen, Yukun Hu
Abstract: Variational data assimilation estimates the dynamical system states by minimizing a cost function that fits the numerical models with observational data. The widely used method, four-dimensional variational assimilation (4D-Var), has two primary challenges: (1) computationally demanding for complex nonlinear systems and (2) relying on state-observation mappings, which are often not perfectly known. Deep learning (DL) has been used as a more expressive class of efficient model approximators to address these challenges. However, integrating such models into 4D-Var remains challenging due to their inherent nonlinearities and the lack of theoretical guarantees for consistency in assimilation results. In this paper, we propose Tensor-Var to address these challenges using kernel Conditional Mean Embedding (CME). Tensor-Var improves optimization efficiency by characterizing system dynamics and state-observation mappings as linear operators, leading to a convex cost function in the feature space. Furthermore, our method provides a new perspective to incorporate CME into 4D-Var, offering theoretical guarantees of consistent assimilation results between the original and feature spaces. To improve scalability, we propose a method to learn deep features (DFs) using neural networks within the Tensor-Var framework.
Experiments on chaotic systems and global weather prediction with real-time observations show that Tensor-Var outperforms conventional and DL hybrid 4D-Var baselines in accuracy while achieving efficiency comparable to the static 3D-Var method.
Submitted 12 February, 2025; v1 submitted 22 January, 2025; originally announced January 2025.
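One simple reading of the conditional-mean-embedding step is fitting a linear operator between feature spaces by ridge regression; the sketch below illustrates that reading only, with placeholder feature matrices and regularizer, not Tensor-Var's actual estimator.

```python
# Illustrative sketch only: a ridge-regression estimate of a linear operator
# mapping state features to observation features.
import numpy as np

def fit_linear_operator(Phi_x, Phi_y, lam=1e-3):
    """Phi_x: (n, d) state features, Phi_y: (n, p) observation features.
    Returns C such that C @ phi(x) approximates phi(y) in the ridge sense."""
    d = Phi_x.shape[1]
    return np.linalg.solve(Phi_x.T @ Phi_x + lam * np.eye(d), Phi_x.T @ Phi_y).T

rng = np.random.default_rng(0)
X = rng.normal(size=(500, 8))                                        # toy state features
Y = X @ rng.normal(size=(8, 3)) + 0.01 * rng.normal(size=(500, 3))   # near-linear observations
C = fit_linear_operator(X, Y)
print(C.shape)  # (3, 8): a linear map between the two feature spaces
```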
arXiv:2501.11284 [pdf, other] (cs.LG, cs.AI, cs.CL)
RedStar: Does Scaling Long-CoT Data Unlock Better Slow-Reasoning Systems?
Authors: Haotian Xu, Xing Wu, Weinong Wang, Zhongzhi Li, Da Zheng, Boyuan Chen, Yi Hu, Shijia Kang, Jiaming Ji, Yingying Zhang, Zhijiang Guo, Yaodong Yang, Muhan Zhang, Debing Zhang
Abstract: Can scaling transform reasoning? In this work, we explore the untapped potential of scaling Long Chain-of-Thought (Long-CoT) data to 1000k samples, pioneering the development of a slow-thinking model, RedStar. Through extensive experiments with various LLMs and different sizes, we uncover the ingredients for specialization and scale for Long-CoT training. Surprisingly, even smaller models show significant performance gains with limited data, revealing the sample efficiency of Long-CoT and the critical role of sample difficulty in the learning process. Our findings demonstrate that Long-CoT reasoning can be effectively triggered with just a few thousand examples, while larger models achieve unparalleled improvements. We also introduce reinforcement learning (RL)-scale training as a promising direction for advancing slow-thinking systems. RedStar shines across domains: on the MATH-Hard benchmark, RedStar-code-math boosts performance from 66.2% to 81.6%, and on the USA Math Olympiad (AIME), it solves 46.7% of problems using only 21k mixed-code-math datasets. In multimodal tasks like GeoQA and MathVista-GEO, RedStar-Geo achieves competitive results with minimal Long-CoT data, outperforming other slow-thinking systems like QvQ-Preview. Compared to QwQ, RedStar strikes the perfect balance between reasoning and generalizability. Our work highlights that, with careful tuning, scaling Long-CoT can unlock extraordinary reasoning capabilities, even with limited datasets, and sets a new standard for slow-thinking models across diverse challenges. Our data and models are released at https://huggingface.co/RedStar-Reasoning.
Submitted 20 January, 2025; originally announced January 2025.
Comments: technique-report, https://huggingface.co/RedStar-Reasoning

arXiv:2501.11091 [pdf] (cs.CR)
Bitcoin: A Non-Continuous Time System
Authors: Bin Chen
Abstract: In this paper, we explore the concept of time within Bitcoin's blockchain, which operates as a non-continuous time system. We focus on three core aspects that contribute to Bitcoin's time discontinuity: the random and distributed block generation process, the occurrence of forks and rollbacks that disrupt the linear progression of the blockchain, and the nature of transactions within this system, which are subject to potential reordering or invalidation. These elements combine to create a time structure in Bitcoin that is fundamentally different from the continuous, linear time systems typically seen in traditional computing and physics. Additionally, the implications of this non-continuous time model for the future of decentralized technologies and their potential applications are discussed.
Submitted 19 January, 2025; originally announced January 2025.
arXiv:2501.09503 [pdf, other] (cs.CV)
AnyStory: Towards Unified Single and Multiple Subject Personalization in Text-to-Image Generation
Authors: Junjie He, Yuxiang Tuo, Binghui Chen, Chongyang Zhong, Yifeng Geng, Liefeng Bo
Abstract: Recently, large-scale generative models have demonstrated outstanding text-to-image generation capabilities. However, generating high-fidelity personalized images with specific subjects still presents challenges, especially in cases involving multiple subjects. In this paper, we propose AnyStory, a unified approach for personalized subject generation. AnyStory not only achieves high-fidelity personalization for single subjects, but also for multiple subjects, without sacrificing subject fidelity. Specifically, AnyStory models the subject personalization problem in an "encode-then-route" manner. In the encoding step, AnyStory utilizes a universal and powerful image encoder, i.e., ReferenceNet, in conjunction with CLIP vision encoder to achieve high-fidelity encoding of subject features. In the routing step, AnyStory utilizes a decoupled instance-aware subject router to accurately perceive and predict the potential location of the corresponding subject in the latent space, and guide the injection of subject conditions. Detailed experimental results demonstrate the excellent performance of our method in retaining subject details, aligning text descriptions, and personalizing for multiple subjects. The project page is at https://aigcdesigngroup.github.io/AnyStory/ .
Submitted 16 January, 2025; originally announced January 2025.
Comments: Tech report; Project page: https://aigcdesigngroup.github.io/AnyStory/

arXiv:2501.09107 [pdf, ps, other] (cs.LG)
Rethinking Post-Training Quantization: Introducing a Statistical Pre-Calibration Approach
Authors: Alireza Ghaffari, Sharareh Younesian, Boxing Chen, Vahid Partovi Nia, Masoud Asgharian
Abstract: As Large Language Models (LLMs) become increasingly computationally complex, developing efficient deployment strategies, such as quantization, becomes crucial. State-of-the-art Post-training Quantization (PTQ) techniques often rely on calibration processes to maintain the accuracy of these models. However, while these calibration techniques can enhance performance in certain domains, they may not be as effective in others. This paper aims to draw attention to robust statistical approaches that can mitigate such issues.
We propose a weight-adaptive PTQ method that can be considered a precursor to calibration-based PTQ methods, guiding the quantization process to preserve the distribution of weights by minimizing the Kullback-Leibler divergence between the quantized weights and the originally trained weights. This minimization ensures that the quantized model retains the Shannon information content of the original model to a great extent, guaranteeing robust and efficient deployment across many tasks. As such, our proposed approach can perform on par with most common calibration-based PTQ methods, establishing a new pre-calibration step for further adjusting the quantized weights with calibration. We show that our pre-calibration results achieve the same accuracy as some existing calibration-based PTQ methods on various LLMs.
Submitted 15 January, 2025; originally announced January 2025.
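A minimal sketch of the statistical idea: pick a per-tensor clipping scale that keeps the quantized weight distribution close to the original in KL divergence. The bit width, histogram grid, and search range below are assumptions, not the paper's procedure.

```python
# Illustrative sketch only: choose a quantization scale by minimizing the KL
# divergence between the original weights and their quantize-dequantize version.
import numpy as np

def quant_dequant(w, scale, bits=4):
    qmax = 2 ** (bits - 1) - 1
    q = np.clip(np.round(w / scale), -qmax - 1, qmax)
    return q * scale

def kl_between_histograms(a, b, bins=256):
    lo, hi = min(a.min(), b.min()), max(a.max(), b.max())
    pa, _ = np.histogram(a, bins=bins, range=(lo, hi), density=True)
    pb, _ = np.histogram(b, bins=bins, range=(lo, hi), density=True)
    pa, pb = pa + 1e-10, pb + 1e-10            # avoid log(0)
    pa, pb = pa / pa.sum(), pb / pb.sum()
    return float(np.sum(pa * np.log(pa / pb)))

def precalibrate_scale(w, bits=4, grid=50):
    """Grid-search clipping scales and keep the one whose quantized weights stay
    closest (in KL) to the original weight distribution."""
    candidates = np.linspace(0.2, 1.0, grid) * np.abs(w).max() / (2 ** (bits - 1) - 1)
    return min(candidates, key=lambda s: kl_between_histograms(w, quant_dequant(w, s, bits)))

w = np.random.default_rng(0).normal(size=10_000).astype(np.float32)
print(precalibrate_scale(w))
```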
arXiv:2501.07114 [pdf, other] (cs.CV)
Duplex: Dual Prototype Learning for Compositional Zero-Shot Learning
Authors: Zhong Peng, Yishi Xu, Gerong Wang, Wenchao Chen, Bo Chen, Jing Zhang
Abstract: Compositional Zero-Shot Learning (CZSL) aims to enable models to recognize novel compositions of visual states and objects that were absent during training. Existing methods predominantly focus on learning semantic representations of seen compositions but often fail to disentangle the independent features of states and objects in images, thereby limiting their ability to generalize to unseen compositions. To address this challenge, we propose Duplex, a novel dual-prototype learning method that integrates semantic and visual prototypes through a carefully designed dual-branch architecture, enabling effective representation learning for compositional tasks. Duplex utilizes a Graph Neural Network (GNN) to adaptively update visual prototypes, capturing complex interactions between states and objects. Additionally, it leverages the strong visual-semantic alignment of pre-trained Vision-Language Models (VLMs) and employs a multi-path architecture combined with prompt engineering to align image and text representations, ensuring robust generalization. Extensive experiments on three benchmark datasets demonstrate that Duplex outperforms state-of-the-art methods in both closed-world and open-world settings.
Submitted 13 January, 2025; originally announced January 2025.

arXiv:2501.06566 [pdf, other] (cs.RO, eess.SY)
Cooperative Aerial Robot Inspection Challenge: A Benchmark for Heterogeneous Multi-UAV Planning and Lessons Learned
Authors: Muqing Cao, Thien-Minh Nguyen, Shenghai Yuan, Andreas Anastasiou, Angelos Zacharia, Savvas Papaioannou, Panayiotis Kolios, Christos G. Panayiotou, Marios M. Polycarpou, Xinhang Xu, Mingjie Zhang, Fei Gao, Boyu Zhou,
Ben M. Chen, Lihua Xie
Abstract: We propose the Cooperative Aerial Robot Inspection Challenge (CARIC), a simulation-based benchmark for motion planning algorithms in heterogeneous multi-UAV systems. CARIC features UAV teams with complementary sensors, realistic constraints, and evaluation metrics prioritizing inspection quality and efficiency. It offers a ready-to-use perception-control software stack and diverse scenarios to support the development and evaluation of task allocation and motion planning algorithms. Competitions using CARIC were held at IEEE CDC 2023 and the IROS 2024 Workshop on Multi-Robot Perception and Navigation, attracting innovative solutions from research teams worldwide. This paper examines the top three teams from CDC 2023, analyzing their exploration, inspection, and task allocation strategies while drawing insights into their performance across scenarios. The results highlight the task's complexity and suggest promising directions for future research in cooperative multi-UAV systems.
Submitted 14 January, 2025; v1 submitted 11 January, 2025; originally announced January 2025.
Comments: Please find our website at https://ntu-aris.github.io/caric

arXiv:2501.05961 [pdf, other] (cs.CV, eess.IV)
Swin-X2S: Reconstructing 3D Shape from 2D Biplanar X-ray with Swin Transformers
Authors: Kuan Liu, Zongyuan Ying, Jie Jin, Dongyan Li, Ping Huang, Wenjian Wu, Zhe Chen, Jin Qi, Yong Lu, Lianfu Deng, Bo Chen
Abstract: The conversion from 2D X-ray to 3D shape holds significant potential for improving diagnostic efficiency and safety. However, existing reconstruction methods often rely on hand-crafted features, manual intervention, and prior knowledge, resulting in unstable shape errors and additional processing costs. In this paper, we introduce Swin-X2S, an end-to-end deep learning method for directly reconstructing 3D segmentation and labeling from 2D biplanar orthogonal X-ray images. Swin-X2S employs an encoder-decoder architecture: the encoder leverages 2D Swin Transformer for X-ray information extraction, while the decoder employs 3D convolution with cross-attention to integrate structural features from orthogonal views.
A dimension-expanding module is introduced to bridge the encoder and decoder, ensuring a smooth conversion from 2D pixels to 3D voxels. We evaluate the proposed method through extensive qualitative and quantitative experiments across nine publicly available datasets covering four anatomies (femur, hip, spine, and rib), with a total of 54 categories. Significant improvements over previous methods have been observed not only in the segmentation and labeling metrics but also in the clinically relevant parameters that are of primary concern in practical applications, which demonstrates the promise of Swin-X2S to provide an effective option for anatomical shape reconstruction in clinical scenarios. Code implementation is available at: https://github.com/liukuan5625/Swin-X2S.
Submitted 10 January, 2025; originally announced January 2025.

arXiv:2501.03220 [pdf, other] (cs.CV)
ProTracker: Probabilistic Integration for Robust and Accurate Point Tracking
Authors: Tingyang Zhang, Chen Wang, Zhiyang Dou, Qingzhe Gao, Jiahui Lei, Baoquan Chen, Lingjie Liu
Abstract: In this paper, we propose ProTracker, a novel framework for robust and accurate long-term dense tracking of arbitrary points in videos.
The key idea of our method is incorporating probabilistic integration to refine multiple predictions from both optical flow and semantic features for robust short-term and long-term tracking. Specifically, we integrate optical flow estimations in a probabilistic manner, producing smooth and accurate trajectories by maximizing the likelihood of each prediction. To effectively re-localize challenging points that disappear and reappear due to occlusion, we further incorporate long-term feature correspondence into our flow predictions for continuous trajectory generation. Extensive experiments show that ProTracker achieves the state-of-the-art performance among unsupervised and self-supervised approaches, and even outperforms supervised methods on several benchmarks. Our code and model will be publicly available upon publication.
Submitted 6 January, 2025; originally announced January 2025.
Comments: Project page: https://michaelszj.github.io/protracker
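One plain reading of the probabilistic integration step is maximum-likelihood fusion of several position estimates under independent Gaussians, i.e. an inverse-variance weighted mean; the interface below is a hypothetical illustration, not ProTracker's code.

```python
# Illustrative sketch only: fuse several point-position predictions by maximizing
# the joint likelihood under independent isotropic Gaussians.
import numpy as np

def fuse_predictions(means, variances):
    """means: (m, 2) predicted positions; variances: (m,) per-prediction uncertainty.
    The maximum-likelihood fused position is the inverse-variance weighted mean."""
    w = 1.0 / np.asarray(variances)
    fused = (np.asarray(means) * w[:, None]).sum(axis=0) / w.sum()
    fused_var = 1.0 / w.sum()
    return fused, fused_var

# Two optical-flow estimates and one semantic-feature match for the same point.
preds = [(10.2, 5.1), (10.6, 5.3), (9.8, 4.9)]
print(fuse_predictions(preds, [0.5, 0.8, 2.0]))
```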
arXiv:2501.00707 [pdf, other] (cs.CV, cs.AI, cs.CR)
Everywhere Attack: Attacking Locally and Globally to Boost Targeted Transferability
Authors: Hui Zeng, Sanshuai Cui, Biwei Chen, Anjie Peng
Abstract: Adversarial examples' (AE) transferability refers to the phenomenon that AEs crafted with one surrogate model can also fool other models. Notwithstanding remarkable progress in untargeted transferability, its targeted counterpart remains challenging. This paper proposes an everywhere scheme to boost targeted transferability. Our idea is to attack a victim image both globally and locally. We aim to optimize 'an army of targets' in every local image region instead of the previous works that optimize a high-confidence target in the image. Specifically, we split a victim image into non-overlapping blocks and jointly mount a targeted attack on each block. Such a strategy mitigates transfer failures caused by attention inconsistency between surrogate and victim models and thus results in stronger transferability. Our approach is method-agnostic, which means it can be easily combined with existing transferable attacks for even higher transferability. Extensive experiments on ImageNet demonstrate that the proposed approach universally improves the state-of-the-art targeted attacks by a clear margin, e.g., the transferability of the widely adopted Logit attack can be improved by 28.8%-300%. We also evaluate the crafted AEs on a real-world platform: Google Cloud Vision. Results further support the superiority of the proposed method.
Submitted 31 December, 2024; originally announced January 2025.
Comments: 11 pages, 6 figures, 8 tables, accepted by 2025AAAI
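A hedged sketch of the block-wise ("everywhere") idea layered on a single iterative targeted step; the model interface, block size, and step sizes are assumptions, and this is not the authors' released implementation.

```python
# Illustrative sketch only: one step of a block-wise targeted perturbation that
# sums a targeted loss over every non-overlapping block of the victim image.
import torch
import torch.nn.functional as F

def everywhere_step(model, x, target_class, block=56, eps=8 / 255, alpha=2 / 255):
    """x: (1, 3, H, W) in [0, 1]; target_class: LongTensor of shape (1,).
    Take one signed-gradient step toward the target, keeping the perturbation
    inside an L-infinity ball of radius eps."""
    x_adv = x.clone().requires_grad_(True)
    loss = 0.0
    _, _, H, W = x.shape
    for i in range(0, H, block):
        for j in range(0, W, block):
            patch = x_adv[:, :, i:i + block, j:j + block]
            patch = F.interpolate(patch, size=(H, W), mode="bilinear", align_corners=False)
            loss = loss + F.cross_entropy(model(patch), target_class)  # push every block toward the target
    loss.backward()
    with torch.no_grad():
        x_adv = x_adv - alpha * x_adv.grad.sign()       # targeted: descend the loss
        x_adv = x + (x_adv - x).clamp(-eps, eps)        # project back into the eps-ball
    return x_adv.clamp(0, 1).detach()

# Hypothetical usage with a pretrained classifier:
#   model = torchvision.models.resnet50(weights="IMAGENET1K_V2").eval()
#   x_adv = everywhere_step(model, x, torch.tensor([207]))
```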
arXiv:2412.20807 [pdf] (cs.CV, cs.AI)
Two Heads Are Better Than One: Averaging along Fine-Tuning to Improve Targeted Transferability
Authors: Hui Zeng, Sanshuai Cui, Biwei Chen, Anjie Peng
Abstract: Despite much longer optimization time than that of untargeted attacks, the transferability of targeted attacks is still far from satisfactory. Recent studies reveal that fine-tuning an existing adversarial example (AE) in feature space can efficiently boost its targeted transferability. However, existing fine-tuning schemes only utilize the endpoint and ignore the valuable information in the fine-tuning trajectory. Noting that the vanilla fine-tuning trajectory tends to oscillate around the periphery of a flat region of the loss surface, we propose averaging over the fine-tuning trajectory to pull the crafted AE towards a more centered region. We compare the proposed method with existing fine-tuning schemes by integrating them with state-of-the-art targeted attacks in various attacking scenarios. Experimental results uphold the superiority of the proposed method in boosting targeted transferability. The code is available at github.com/zengh5/Avg_FT.
Submitted 30 December, 2024; originally announced December 2024.
Comments: 9 pages, 6 figures, accepted by 2025ICASSP
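The trajectory-averaging idea reduces to keeping a running mean of the iterates produced during fine-tuning rather than using only the final iterate; a toy sketch follows, with the fine-tuning step left as a stand-in.

```python
# Illustrative sketch only: average the adversarial example over fine-tuning
# iterations instead of returning the last iterate.
import torch

def finetune_with_averaging(x_adv, finetune_step, iters=10):
    """x_adv: initial adversarial example; finetune_step: callable mapping the
    current AE to the next iterate (one feature-space fine-tuning update)."""
    avg = x_adv.clone()
    current = x_adv.clone()
    for t in range(1, iters + 1):
        current = finetune_step(current)
        avg = avg + (current - avg) / (t + 1)   # incremental mean over the trajectory
    return avg

# Toy stand-in step: a small random perturbation plays the role of a real update.
toy_step = lambda x: x + 0.01 * torch.randn_like(x)
print(finetune_with_averaging(torch.zeros(3, 224, 224), toy_step).shape)
```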
arXiv:2412.18588 (https://arxiv.org/abs/2412.18588) [pdf, other] cs.RO, cs.AI, eess.SY
A Paragraph is All It Takes: Rich Robot Behaviors from Interacting, Trusted LLMs
Authors: OpenMind, Shaohong Zhong, Adam Zhou, Boyuan Chen, Homin Luo, Jan Liphardt
Abstract: Large Language Models (LLMs) are compact representations of all public knowledge of our physical environment and animal and human behaviors. The application of LLMs to robotics may offer a path to highly capable robots that perform well across most human tasks with limited or even zero tuning. Aside from increasingly sophisticated reasoning and task planning, networks of (suitably designed) LLMs offer ease of upgrading capabilities and allow humans to directly observe the robot's thinking. Here we explore the advantages, limitations, and particularities of using LLMs to control physical robots. The basic system consists of four LLMs communicating via a human-language data bus implemented via web sockets and ROS2 message passing. Surprisingly, rich robot behaviors and good performance across different tasks could be achieved despite the robot's data fusion cycle running at only 1 Hz and the central data bus running at the extremely limited rates of the human brain, around 40 bits/s. The use of natural language for inter-LLM communication allowed the robot's reasoning and decision making to be directly observed by humans and made it trivial to bias the system's behavior with sets of rules written in plain English. These rules were immutably written into Ethereum, a global, public, and censorship-resistant Turing-complete computer. We suggest that by using natural language as the data bus among interacting AIs, and immutable public ledgers to store behavior constraints, it is possible to build robots that combine unexpectedly rich performance, upgradability, and durable alignment with humans.
Submitted 24 December, 2024; originally announced December 2024.
Comments: 10 pages, 1 figure.
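The following toy sketch (purely illustrative, not the paper's system) mimics the natural-language data bus: several LLM "modules", stubbed here by call_llm, read a shared plain-English context that includes behavior rules and publish their replies once per fusion cycle. The real system uses web sockets, ROS2, and rules stored on Ethereum, none of which are reproduced here.

import queue
import time

RULES = ["Never harm a human.", "Stop immediately when asked."]  # stand-in for on-chain rules

def call_llm(role, prompt):
    # Stub for a real LLM call; here it just echoes a canned acknowledgement.
    return f"[{role}] ack: {prompt[:40]}"

class LanguageBus:
    # Toy human-language data bus shared by several LLM "modules".
    def __init__(self):
        self.messages = queue.Queue()

    def publish(self, sender, text):
        self.messages.put((time.time(), sender, text))

    def drain(self):
        out = []
        while not self.messages.empty():
            out.append(self.messages.get())
        return out

def tick(bus, observation):
    # One ~1 Hz fusion cycle: every module reads the rules plus the latest
    # observation and replies in plain English on the shared bus.
    context = "\n".join(RULES) + "\n" + observation
    for role in ("vision", "planner", "dialogue", "motion"):
        bus.publish(role, call_llm(role, context))

bus = LanguageBus()
tick(bus, "A person waves at the robot near the door.")
for _, sender, text in bus.drain():
    print(sender, "->", text)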
arXiv:2412.18241 (https://arxiv.org/abs/2412.18241) [pdf, other] cs.IR, cs.AI
An Automatic Graph Construction Framework based on Large Language Models for Recommendation
Authors: Rong Shan, Jianghao Lin, Chenxu Zhu, Bo Chen, Menghui Zhu, Kangning Zhang, Jieming Zhu, Ruiming Tang, Yong Yu, Weinan Zhang
Abstract: Graph neural networks (GNNs) have
emerged as state-of-the-art methods to learn from graph-structured data for recommendation. However, most existing GNN-based recommendation methods focus on the optimization of model structures and learning strategies based on pre-defined graphs, neglecting the importance of the graph construction stage. Earlier works for graph construction usually rely on specific rules or crowdsourcing, which are either too simplistic or too labor-intensive. Recent works have started to utilize large language models (LLMs) to automate graph construction, in view of their abundant open-world knowledge and remarkable reasoning capabilities. Nevertheless, they generally suffer from two limitations: (1) invisibility of the global view (e.g., overlooking contextual information) and (2) construction inefficiency. To this end, we introduce AutoGraph, an automatic graph construction framework based on LLMs for recommendation. Specifically, we first use LLMs to infer user preference and item knowledge, which is encoded as semantic vectors. Next, we employ vector quantization to extract the latent factors from the semantic vectors. The latent factors are then incorporated as extra nodes to link the user/item nodes, resulting in a graph with in-depth global-view semantics. We further design metapath-based message aggregation to effectively aggregate the semantic and collaborative information. The framework is model-agnostic and compatible with different backbone models. Extensive experiments on three real-world datasets demonstrate the efficacy and efficiency of AutoGraph compared to existing baseline methods. We have deployed AutoGraph in the Huawei advertising platform and gained a 2.69% improvement on RPM and a 7.31% improvement on eCPM in an online A/B test. AutoGraph is currently used as the main traffic model, serving hundreds of millions of people.
Submitted 24 December, 2024; originally announced December 2024.
Comments: Under review.
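As a hedged sketch of the pipeline the abstract outlines (LLM-derived semantic vectors, vector quantization into latent factors, latent-factor nodes linking users and items), the following NumPy snippet builds such an augmented edge list; the random vectors and codebook stand in for LLM encodings and a learned quantizer, and the function names are hypothetical.

import numpy as np

def quantize(vectors, codebook):
    # Assign each semantic vector to its nearest codebook entry (latent factor).
    d = ((vectors[:, None, :] - codebook[None, :, :]) ** 2).sum(-1)
    return d.argmin(axis=1)

def build_graph(user_vecs, item_vecs, interactions, num_factors=8, dim=16, seed=0):
    # Sketch of an AutoGraph-style construction (assumed form): user/item
    # semantic vectors are quantized into latent-factor nodes that link users
    # and items beyond the raw interaction edges.
    rng = np.random.default_rng(seed)
    codebook = rng.normal(size=(num_factors, dim))
    edges = [("user", u, "item", i) for u, i in interactions]    # collaborative edges
    for u, f in enumerate(quantize(user_vecs, codebook)):
        edges.append(("user", u, "factor", int(f)))              # user -> latent factor
    for i, f in enumerate(quantize(item_vecs, codebook)):
        edges.append(("item", i, "factor", int(f)))              # item -> latent factor
    return edges

# Tiny usage example with random vectors standing in for LLM-derived semantics.
users = np.random.default_rng(1).normal(size=(4, 16))
items = np.random.default_rng(2).normal(size=(6, 16))
print(len(build_graph(users, items, [(0, 1), (2, 3)])))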
arXiv:2412.16256 (https://arxiv.org/abs/2412.16256) [pdf, other] cs.HC, cs.AI
Aria-UI: Visual Grounding for GUI Instructions
Authors: Yuhao Yang, Yue Wang, Dongxu Li, Ziyang Luo, Bei Chen, Chao Huang, Junnan Li
Abstract: Digital agents that automate tasks across different platforms by directly manipulating GUIs are increasingly important. For these agents, grounding from language instructions to target elements remains a significant challenge due to reliance on HTML or AXTree inputs. In this paper, we introduce Aria-UI, a large multimodal model specifically designed for GUI grounding. Aria-UI adopts a pure-vision approach, eschewing reliance on auxiliary inputs. To adapt to heterogeneous planning instructions, we propose a scalable data pipeline that synthesizes diverse and high-quality instruction samples for grounding. To handle dynamic contexts during task execution, Aria-UI incorporates textual and text-image interleaved action histories, enabling robust context-aware reasoning for grounding. Aria-UI sets new state-of-the-art results across offline and online agent benchmarks, outperforming both vision-only and AXTree-reliant baselines. We release all training data and model checkpoints to foster further research at https://ariaui.github.io.
Submitted 20 December, 2024; originally announced December 2024.

arXiv:2412.15838 (https://arxiv.org/abs/2412.15838) [pdf, other] cs.AI, cs.CL
Align Anything: Training All-Modality Models to Follow Instructions with Language Feedback
Authors: Jiaming Ji, Jiayi Zhou, Hantao Lou, Boyuan Chen, Donghai Hong, Xuyao Wang, Wenqi Chen, Kaile Wang, Rui Pan, Jiahao Li, Mohan Wang, Josef Dai, Tianyi Qiu, Hua Xu, Dong Li, Weipeng Chen, Jun Song, Bo Zheng, Yaodong Yang
Abstract: Reinforcement learning from human feedback (RLHF) has proven effective in enhancing the instruction-following capabilities of large language models; however, it remains underexplored in the cross-modality domain. As the number of modalities increases, aligning all-modality models with human intentions, such as instruction following, becomes a pressing challenge.
In this work, we make the first attempt to fine-tune all-modality models (i.e., models whose input and output can be any modality, also named any-to-any models) using human preference data across all modalities (including text, image, audio, and video), ensuring their behavior aligns with human intentions. This endeavor presents several challenges. First, there is no large-scale all-modality human preference data in existing open-source resources, as most datasets are limited to specific modalities, predominantly text and image. Second, the effectiveness of binary preferences in RLHF for post-training alignment in complex all-modality scenarios remains an unexplored area. Finally, there is a lack of a systematic framework to evaluate the capabilities of all-modality models, particularly regarding modality selection and synergy. To address these challenges, we propose the align-anything framework, which includes meticulously annotated 200k all-modality human preference data. We then introduce an alignment method that learns from unified language feedback, effectively capturing complex modality-specific human preferences and enhancing the model's instruction-following capabilities. Furthermore, to assess performance improvements in all-modality models after post-training alignment, we construct a challenging all-modality capability evaluation framework, eval-anything. All data, models, and code frameworks have been open-sourced for the community. For more details, please refer to https://github.com/PKU-Alignment/align-anything.
Submitted 30 December, 2024 (v2); v1 submitted 20 December, 2024; originally announced December 2024.

arXiv:2412.15677 (https://arxiv.org/abs/2412.15677) [pdf, other] cs.CV, cs.AI
AI-generated Image Quality Assessment in Visual Communication
Authors: Yu Tian, Yixuan Li, Baoliang Chen, Hanwei Zhu, Shiqi Wang, Sam Kwong
Abstract: Assessing the quality of artificial intelligence-generated images (AIGIs) plays a crucial role in their application in real-world scenarios. However, traditional image quality assessment (IQA) algorithms primarily focus on low-level visual perception, while existing IQA works on AIGIs overemphasize the generated content itself, neglecting its effectiveness in real-world applications. To bridge this gap, we propose AIGI-VC, a quality assessment database for AI-Generated Images in Visual Communication, which studies the communicability of AIGIs in the advertising field from the perspectives of information clarity and emotional interaction. The dataset consists of 2,500 images spanning 14 advertisement topics and 8 emotion types. It provides coarse-grained human preference annotations and fine-grained preference descriptions, benchmarking the abilities of IQA methods in preference prediction, interpretation, and reasoning. We conduct an empirical study of existing representative IQA methods and large multi-modal models on the AIGI-VC dataset, uncovering their strengths and weaknesses.
Submitted 20 December, 2024; originally announced December 2024.
Comments: AAAI 2025; project page: https://github.com/ytian73/AIGI-VC

arXiv:2412.15182 (https://arxiv.org/abs/2412.15182) [pdf, other] cs.RO, cs.LG, eess.SY
STRAP: Robot Sub-Trajectory Retrieval for Augmented Policy Learning
Authors: Marius Memmel, Jacob Berg, Bingqing Chen, Abhishek Gupta, Jonathan Francis
Abstract: Robot learning is witnessing a significant increase in the size, diversity, and complexity of pre-collected datasets, mirroring trends in domains such as natural language processing and computer vision. Many robot learning methods treat such datasets as multi-task expert data and learn a multi-task, generalist policy by training broadly across them. Notably, while these generalist policies can improve average performance across many tasks, their performance on any one task is often suboptimal compared to task-specific specialist policies, due to negative transfer between partitions of the data.
In this work, we argue for the paradigm of training policies during deployment, given the scenarios they encounter: rather than deploying pre-trained policies to unseen problems in a zero-shot manner, we non-parametrically retrieve and train models directly on relevant data at test time. Furthermore, we show that many robotics tasks share considerable amounts of low-level behaviors and that retrieval at the "sub"-trajectory granularity enables significantly improved data utilization, generalization, and robustness in adapting policies to novel problems. In contrast, existing full-trajectory retrieval methods tend to underutilize the data and miss out on shared cross-task content. This work proposes STRAP, a technique for leveraging pre-trained vision foundation models and dynamic time warping to retrieve sub-sequences of trajectories from large training corpora in a robust fashion. STRAP outperforms both prior retrieval algorithms and multi-task learning methods in simulated and real experiments, showing the ability to scale to much larger offline datasets in the real world as well as the ability to learn robust control policies with just a handful of real-world demonstrations.
Submitted 19 December, 2024; originally announced December 2024.
Comments: Project website at https://weirdlabuw.github.io/strap/
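A small illustrative sketch of sub-trajectory retrieval in the spirit of STRAP, assuming trajectories are already encoded as per-timestep feature sequences (e.g., by a frozen vision foundation model): sliding windows over the offline dataset are scored with plain dynamic time warping and the closest segments are returned. The window size and scoring details are assumptions, not the paper's exact procedure.

import numpy as np

def dtw(a, b):
    # Plain dynamic-time-warping distance between two feature sequences.
    n, m = len(a), len(b)
    D = np.full((n + 1, m + 1), np.inf)
    D[0, 0] = 0.0
    for i in range(1, n + 1):
        for j in range(1, m + 1):
            cost = np.linalg.norm(a[i - 1] - b[j - 1])
            D[i, j] = cost + min(D[i - 1, j], D[i, j - 1], D[i - 1, j - 1])
    return D[n, m]

def retrieve_subtrajectories(query, dataset, window=8, top_k=3):
    # Score every sliding window of every dataset trajectory against the query
    # sub-trajectory and return the closest segments for policy training.
    scored = []
    for traj_id, traj in enumerate(dataset):
        for start in range(0, max(1, len(traj) - window + 1)):
            seg = traj[start:start + window]
            scored.append((dtw(query, seg), traj_id, start))
    scored.sort(key=lambda t: t[0])
    return scored[:top_k]

# Usage with random features standing in for foundation-model embeddings.
rng = np.random.default_rng(0)
query = rng.normal(size=(6, 32))
dataset = [rng.normal(size=(20, 32)) for _ in range(3)]
print(retrieve_subtrajectories(query, dataset))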
arXiv:2412.14603 (https://arxiv.org/abs/2412.14603) [pdf, other] cs.CV, physics.optics
Successive optimization of optics and post-processing with differentiable coherent PSF operator and field information
Authors: Zheng Ren, Jingwen Zhou, Wenguan Zhang, Jiapu Yan, Bingkun Chen, Huajun Feng, Shiqi Chen
Abstract: Recently, the joint design of optical systems and downstream algorithms has shown significant potential. However, existing ray-based methods are limited to optimizing geometric degradation, making it difficult to fully represent the optical characteristics of complex, miniaturized lenses constrained by wavefront aberration or diffraction effects. In this work, we introduce a precise optical simulation model in which every operation in the pipeline is differentiable. This model employs a novel initial-value strategy to enhance the reliability of intersection calculations on highly aspheric surfaces. Moreover, it utilizes a differential operator to reduce memory consumption during coherent point spread function calculations. To efficiently address various degradations, we design a joint optimization procedure that leverages field information. Guided by a general restoration network, the proposed method not only enhances image quality but also successively improves the optical performance of multiple lenses that are already at a professional level. This joint optimization pipeline offers innovative insights into the practical design of sophisticated optical systems and post-processing algorithms. The source code will be made publicly available at https://github.com/Zrr-ZJU/Successive-optimization
Submitted 23 December, 2024; v1 submitted 19 December, 2024; originally announced December 2024.
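To make the joint-design idea concrete, here is a deliberately simplified, assumed sketch: a single differentiable "lens" parameter shapes a toy Gaussian PSF (standing in for the paper's coherent PSF operator), the degraded image is restored by a small network, and both are updated against one image-quality loss.

import torch
import torch.nn as nn
import torch.nn.functional as F

def gaussian_psf(sigma, size=11):
    # Toy differentiable PSF: an isotropic Gaussian parameterized by sigma.
    # It stands in for the coherent PSF operator, which is not reproduced here.
    ax = torch.arange(size, dtype=torch.float32) - (size - 1) / 2
    xx, yy = torch.meshgrid(ax, ax, indexing="ij")
    psf = torch.exp(-(xx ** 2 + yy ** 2) / (2 * sigma ** 2))
    return (psf / psf.sum()).view(1, 1, size, size)

sigma = torch.tensor(2.0, requires_grad=True)             # "lens" parameter
restorer = nn.Sequential(nn.Conv2d(1, 8, 3, padding=1), nn.ReLU(),
                         nn.Conv2d(8, 1, 3, padding=1))   # post-processing network
opt = torch.optim.Adam([{"params": [sigma], "lr": 1e-2},
                        {"params": restorer.parameters(), "lr": 1e-3}])

target = torch.rand(4, 1, 32, 32)                         # stand-in scene batch
for step in range(50):
    psf = gaussian_psf(sigma)
    degraded = F.conv2d(target, psf, padding=psf.shape[-1] // 2)  # optics simulation
    restored = restorer(degraded)
    loss = F.mse_loss(restored, target)                   # joint image-quality objective
    opt.zero_grad()
    loss.backward()                                        # gradients reach both optics and network
    opt.step()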
arXiv:2412.14518 (https://arxiv.org/abs/2412.14518) [pdf, other] cs.CV, cs.IR, cs.MM
Efficient Self-Supervised Video Hashing with Selective State Spaces
Authors: Jinpeng Wang, Niu Lian, Jun Li, Yuting Wang, Yan Feng, Bin Chen, Yongbing Zhang, Shu-Tao Xia
Abstract: Self-supervised video hashing (SSVH) is a practical task in video indexing and retrieval. Although Transformers are predominant in SSVH for their impressive temporal modeling capabilities, they often suffer from computational and memory inefficiencies. Drawing inspiration from Mamba, an advanced state-space model, we explore its potential in SSVH to achieve a better balance between efficacy and efficiency. We introduce S5VH, a Mamba-based video hashing model with an improved self-supervised learning paradigm. Specifically, we design bidirectional Mamba layers for both the encoder and decoder, which are effective and efficient in capturing temporal relationships thanks to the data-dependent selective scanning mechanism with linear complexity. In our learning strategy, we transform global semantics in the feature space into semantically consistent and discriminative hash centers, followed by a center alignment loss as a global learning signal. Our self-local-global (SLG) paradigm significantly improves learning efficiency, leading to faster and better convergence.
Extensive experiments demonstrate S5VH's improvements over state-of-the-art methods, superior transferability, and scalable advantages in inference efficiency. Code is available at https://github.com/gimpong/AAAI25-S5VH.
Submitted 18 December, 2024; originally announced December 2024.
Comments: Accepted by AAAI'25. 9 pages, 5 figures, 2 tables.
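A minimal sketch of the center-alignment signal described above, assuming hash centers have already been derived from global semantics; the bidirectional Mamba encoder/decoder and the local learning terms are omitted, and the cosine form of the loss is an assumption for illustration.

import torch
import torch.nn.functional as F

def center_alignment_loss(video_emb, hash_centers, labels):
    # video_emb:    (B, K) continuous hash embeddings before binarization
    # hash_centers: (C, K) one center per semantic cluster, entries in {-1, +1}
    # labels:       (B,) cluster index per video
    target = hash_centers[labels]                    # pick each video's center
    # Align tanh-squashed codes with their center; BCE over bits would be another option.
    return 1 - F.cosine_similarity(torch.tanh(video_emb), target, dim=1).mean()

# Toy usage: 4 videos, 16-bit codes, 3 hypothetical hash centers.
centers = torch.sign(torch.randn(3, 16))
emb = torch.randn(4, 16, requires_grad=True)
loss = center_alignment_loss(emb, centers, torch.tensor([0, 2, 1, 0]))
loss.backward()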
arXiv:2412.14480 (https://arxiv.org/abs/2412.14480) [pdf, other] cs.RO, cs.CL, cs.CV, cs.LG
GraphEQA: Using 3D Semantic Scene Graphs for Real-time Embodied Question Answering
Authors: Saumya Saxena, Blake Buchanan, Chris Paxton, Bingqing Chen, Narunas Vaskevicius, Luigi Palmieri, Jonathan Francis, Oliver Kroemer
Abstract: In Embodied Question Answering (EQA), agents must explore and develop a semantic understanding of an unseen environment in order to answer a situated question with confidence. This remains a challenging problem in robotics, due to the difficulties in obtaining useful semantic representations, updating these representations online, and leveraging prior world knowledge for efficient exploration and planning. Aiming to address these limitations, we propose GraphEQA, a novel approach that utilizes real-time 3D metric-semantic scene graphs (3DSGs) and task-relevant images as multi-modal memory for grounding Vision-Language Models (VLMs) to perform EQA tasks in unseen environments. We employ a hierarchical planning approach that exploits the hierarchical nature of 3DSGs for structured planning and semantic-guided exploration. Through experiments in simulation on the HM-EQA dataset and in the real world in home and office environments, we demonstrate that our method outperforms key baselines by completing EQA tasks with higher success rates and fewer planning steps.
Submitted 18 December, 2024; originally announced December 2024.
Comments: Project website: https://saumyasaxena.github.io/grapheqa

arXiv:2412.13942 (https://arxiv.org/abs/2412.13942) [pdf, other] cs.CL
A Rose by Any Other Name: LLM-Generated Explanations Are Good Proxies for Human Explanations to Collect Label Distributions on NLI
Authors: Beiduo Chen, Siyao Peng, Anna Korhonen, Barbara Plank
Abstract: Disagreement in human labeling is ubiquitous and can be captured in human judgment distributions (HJDs). Recent research has shown that explanations provide valuable information for understanding human label variation (HLV) and that large language models (LLMs) can approximate HJDs from a few human-provided label-explanation pairs. However, collecting explanations for every label is still time-consuming. This paper examines whether LLMs can be used to replace humans in generating explanations for approximating HJDs. Specifically, we use LLMs as annotators to generate model explanations for a few given human labels. We test ways to obtain and combine these label-explanations with the goal of approximating human judgment distributions. We further compare the resulting human and model-generated explanations, and test automatic and human explanation selection. Our experiments show that LLM explanations are promising for NLI: to estimate HJDs, generated explanations yield results comparable to human explanations when provided with human labels. Importantly, our results generalize from datasets with human explanations to (i) datasets where they are not available and (ii) challenging out-of-distribution test sets.
Submitted 18 December, 2024; originally announced December 2024.
Comments: 25 pages, 21 figures.
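As an illustration of the estimation setup this abstract describes (not the paper's prompts or models), the sketch below averages per-explanation label distributions returned by a stubbed LLM call to approximate a human judgment distribution; llm_distribution is a placeholder for a real scoring call.

import numpy as np

LABELS = ("entailment", "neutral", "contradiction")

def llm_distribution(premise, hypothesis, label, explanation):
    # Stub for an LLM call that, given one label-explanation pair, returns a
    # probability over NLI labels (e.g., read off from next-token scores).
    probs = np.full(len(LABELS), 0.1)
    probs[LABELS.index(label)] = 0.8                 # canned response for the sketch
    return probs / probs.sum()

def approximate_hjd(premise, hypothesis, label_explanations):
    # Average the per-explanation distributions to approximate the human
    # judgment distribution (HJD); the aggregation rule is an assumption.
    dists = [llm_distribution(premise, hypothesis, lab, expl)
             for lab, expl in label_explanations]
    return np.mean(dists, axis=0)

pairs = [("entailment", "The hypothesis restates the premise."),
         ("neutral", "The premise does not mention the location.")]
print(dict(zip(LABELS, approximate_hjd("p", "h", pairs).round(3))))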
