
Search | arXiv e-print repository

Showing 1–50 of 539 results for author: Peng, X

Searching in archive cs.
</div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&amp;query=Peng%2C+X&amp;start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&amp;query=Peng%2C+X&amp;start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Peng%2C+X&amp;start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Peng%2C+X&amp;start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Peng%2C+X&amp;start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Peng%2C+X&amp;start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">&hellip;</span></li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.14373">arXiv:2502.14373</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.14373">pdf</a>, <a href="https://arxiv.org/format/2502.14373">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> CrossVTON: Mimicking the Logic Reasoning on Cross-category Virtual Try-on guided by Tri-zone Priors </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Luo%2C+D">Donghao Luo</a>, <a href="/search/cs?searchtype=author&amp;query=Liang%2C+Y">Yujie Liang</a>, <a href="/search/cs?searchtype=author&amp;query=Peng%2C+X">Xu Peng</a>, <a href="/search/cs?searchtype=author&amp;query=Hu%2C+X">Xiaobin Hu</a>, <a href="/search/cs?searchtype=author&amp;query=Jiang%2C+B">Boyuan Jiang</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+C">Chengming Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Jin%2C+T">Taisong Jin</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+C">Chengjie Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Fu%2C+Y">Yanwei Fu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.14373v1-abstract-short" style="display: inline;"> Despite remarkable progress in image-based virtual try-on systems, generating realistic and robust fitting images for cross-category virtual try-on remains a challenging task. 
The primary difficulty arises from the absence of human-like reasoning, which involves addressing size mismatches between garments and models while recognizing and leveraging the distinct functionalities of various regions w&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.14373v1-abstract-full').style.display = 'inline'; document.getElementById('2502.14373v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.14373v1-abstract-full" style="display: none;"> Despite remarkable progress in image-based virtual try-on systems, generating realistic and robust fitting images for cross-category virtual try-on remains a challenging task. The primary difficulty arises from the absence of human-like reasoning, which involves addressing size mismatches between garments and models while recognizing and leveraging the distinct functionalities of various regions within the model images. To address this issue, we draw inspiration from human cognitive processes and disentangle the complex reasoning required for cross-category try-on into a structured framework. This framework systematically decomposes the model image into three distinct regions: try-on, reconstruction, and imagination zones. Each zone plays a specific role in accommodating the garment and facilitating realistic synthesis. To endow the model with robust reasoning capabilities for cross-category scenarios, we propose an iterative data constructor. This constructor encompasses diverse scenarios, including intra-category try-on, any-to-dress transformations (replacing any garment category with a dress), and dress-to-any transformations (replacing a dress with another garment category). Utilizing the generated dataset, we introduce a tri-zone priors generator that intelligently predicts the try-on, reconstruction, and imagination zones by analyzing how the input garment is expected to align with the model image. Guided by these tri-zone priors, our proposed method, CrossVTON, achieves state-of-the-art performance, surpassing existing baselines in both qualitative and quantitative evaluations. Notably, it demonstrates superior capability in handling cross-category virtual try-on, meeting the complex demands of real-world applications. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.14373v1-abstract-full').style.display = 'none'; document.getElementById('2502.14373v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. 
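
A toy numpy sketch of the tri-zone decomposition this abstract describes: each pixel of the model image is assigned to a try-on, reconstruction, or imagination zone, and each zone is filled from a different source. The zone layout and pixel values here are invented for illustration; in the paper the zones are predicted by a learned tri-zone priors generator.

    import numpy as np

    H, W = 6, 4
    zones = np.zeros((H, W), dtype=np.uint8)   # 0 = reconstruction: keep model pixels
    zones[2:5, 1:3] = 1                        # 1 = try-on: draw the new garment here
    zones[5:, :] = 2                           # 2 = imagination: synthesize what's missing

    model_img = np.full((H, W), 7.0)           # stand-in pixel values
    garment = np.full((H, W), 3.0)
    synthesized = np.full((H, W), 5.0)

    # Compose the output by routing each zone to its source.
    out = np.where(zones == 0, model_img, np.where(zones == 1, garment, synthesized))
    print(out)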

2. arXiv:2502.13163 [pdf, other] cs.OS cs.CR cs.SE
Title: A Survey of Fuzzing Open-Source Operating Systems
Authors: Kun Hu, Qicai Chen, Zilong Lu, Wenzhuo Zhang, Bihuan Chen, You Lu, Haowen Jiang, Bingkun Sun, Xin Peng, Wenyun Zhao
Abstract: Vulnerabilities in open-source operating systems (OSs) pose substantial security risks to software systems, making their detection crucial. While fuzzing has been an effective vulnerability detection technique in various domains, OS fuzzing (OSF) faces unique challenges due to OS complexity and multi-layered interaction, and has not been comprehensively reviewed. Therefore, this work systematically surveys the state-of-the-art OSF techniques, categorizes them based on the general fuzzing process, and investigates challenges specific to kernel, file system, driver, and hypervisor fuzzing. Finally, future research directions for OSF are discussed. GitHub: https://github.com/pghk13/Survey-OSF.
Submitted 20 February, 2025; v1 submitted 16 February, 2025; originally announced February 2025.
Comments: 45 pages

3. arXiv:2502.11433 [pdf, other] cs.AI cs.CE q-fin.TR
Title: FLAG-Trader: Fusion LLM-Agent with Gradient-based Reinforcement Learning for Financial Trading
Authors: Guojun Xiong, Zhiyang Deng, Keyi Wang, Yupeng Cao, Haohang Li, Yangyang Yu, Xueqing Peng, Mingquan Lin, Kaleb E Smith, Xiao-Yang Liu, Jimin Huang, Sophia Ananiadou, Qianqian Xie
Abstract: Large language models (LLMs) fine-tuned on multimodal financial data have demonstrated impressive reasoning capabilities in various financial tasks. However, they often struggle with multi-step, goal-oriented scenarios in interactive financial markets, such as trading, where complex agentic approaches are required to improve decision-making. To address this, we propose FLAG-Trader, a unified architecture integrating linguistic processing (via LLMs) with gradient-driven reinforcement learning (RL) policy optimization, in which a partially fine-tuned LLM acts as the policy network, leveraging pre-trained knowledge while adapting to the financial domain through parameter-efficient fine-tuning. Through policy gradient optimization driven by trading rewards, our framework not only enhances LLM performance in trading but also improves results on other financial-domain tasks. We present extensive empirical evidence to validate these enhancements.
Submitted 18 February, 2025; v1 submitted 16 February, 2025; originally announced February 2025.
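
A minimal REINFORCE-style sketch of the "LLM as policy network" idea in the abstract above, with a tiny MLP standing in for the partially fine-tuned LLM and a random-walk price series standing in for a market; none of the architecture, environment, or hyperparameters come from the paper.

    import torch
    import torch.nn as nn

    torch.manual_seed(0)

    # A tiny MLP stands in for the (partially fine-tuned) LLM policy network.
    policy = nn.Sequential(nn.Linear(4, 32), nn.Tanh(), nn.Linear(32, 3))
    opt = torch.optim.Adam(policy.parameters(), lr=1e-3)

    for episode in range(200):
        prices = torch.randn(65).cumsum(0)          # toy random-walk price path
        log_probs, rewards = [], []
        for t in range(60):
            state = prices[t:t + 4]                 # trailing price window as "state"
            dist = torch.distributions.Categorical(logits=policy(state))
            action = dist.sample()                  # 0 = sell, 1 = hold, 2 = buy
            log_probs.append(dist.log_prob(action))
            position = float(action.item()) - 1.0   # -1, 0, +1
            rewards.append(position * (prices[t + 4] - prices[t + 3]))  # P&L as reward
        returns = torch.stack(rewards).flip(0).cumsum(0).flip(0)        # reward-to-go
        loss = -(torch.stack(log_probs) * returns).mean()               # policy gradient
        opt.zero_grad(); loss.backward(); opt.step()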

4. arXiv:2502.11407 [pdf, other] cs.DC
Title: Gensor: A Graph-based Construction Tensor Compilation Method for Deep Learning
Authors: Hangda Liu, Boyu Diao, Yu Yang, Wenxin Chen, Xiaohui Peng, Yongjun Xu
Abstract: High-performance deep learning depends on efficient tensor programs. In recent years, automatic tensor program optimization, also known as tensor compilation, has emerged as the primary approach to generating efficient tensor programs. However, generating kernels with higher performance in a shorter time remains a key challenge. In this paper, we present Gensor, a graph-based construction tensor compilation method for deep learning, to further improve the performance of construction tensor compilation. Unlike existing tree-based methods, Gensor abstracts the construction space into a graph structure and explores it with Markov analysis. Gensor takes tensor programs as states and models scheduling primitives as transition actions between these states, so tensor program construction optimization is abstracted as a graph traversal process. This approach expands the optimization space, improving operator performance while ensuring rapid optimization. Extensive experiments with typical operators demonstrate that Gensor significantly outperforms the state-of-the-art methods on GPUs for both cloud servers and edge devices. As a result, Gensor can generate operator kernels in seconds, with performance increasing by 18% on average and reaching a maximum of 30%. It also achieves high speedup for end-to-end models like ResNet-50 and GPT-2, with an average acceleration of 20%.
Submitted 16 February, 2025; originally announced February 2025.
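
A toy illustration of construction-as-graph-traversal as the abstract frames it: states are candidate programs, scheduling primitives are transition edges, and a Markov-style random walk prefers lower-cost neighbors. The "program" here is just a pair of tile sizes with a synthetic cost model, not a real tensor compiler.

    import random

    random.seed(0)

    def neighbors(state):
        """Apply toy 'scheduling primitives': double or halve a tile size."""
        tx, ty = state
        cand = [(tx * 2, ty), (max(1, tx // 2), ty), (tx, ty * 2), (tx, max(1, ty // 2))]
        return [s for s in cand if s != state]

    def cost(state):
        """Synthetic stand-in for measured kernel latency (lower is better)."""
        tx, ty = state
        return abs(tx - 32) + abs(ty - 8) + 0.01 * tx * ty

    state, best = (1, 1), (1, 1)
    for step in range(200):
        nxt = random.choice(neighbors(state))
        # Markov-style acceptance: always move downhill, occasionally uphill.
        if cost(nxt) < cost(state) or random.random() < 0.1:
            state = nxt
        if cost(state) < cost(best):
            best = state
    print(best, cost(best))  # typically lands near the synthetic optimum (32, 8)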

5. arXiv:2502.08127 [pdf, other] cs.CL
Title: Fino1: On the Transferability of Reasoning Enhanced LLMs to Finance
Authors: Lingfei Qian, Weipeng Zhou, Yan Wang, Xueqing Peng, Jimin Huang, Qianqian Xie
Abstract: Recent advancements in large language models (LLMs) have shown strong general reasoning abilities, yet their effectiveness in financial reasoning remains underexplored. In this study, we comprehensively evaluate 16 powerful reasoning and general LLMs on three complex financial tasks involving financial text, tabular data, and equations, assessing numerical reasoning, tabular interpretation, financial terminology comprehension, long-context processing, and equation-based problem solving. Our results show that while better datasets and pretraining improve financial reasoning, general enhancements like CoT fine-tuning do not always yield consistent gains. Moreover, all reasoning strategies face challenges in improving performance on long-context and multi-table tasks. To address these limitations, we develop a financial reasoning-enhanced model based on Llama-3.1-8B-Instruct, by CoT fine-tuning and reinforcement learning with domain-specific reasoning paths. Even with simple fine-tuning on one financial dataset, our model achieves a consistent 10% performance improvement across tasks, surpassing all 8B models and even Llama3-70B-Instruct and Llama3.1-70B-Instruct on average. Our results highlight the need for domain-specific adaptations in financial tasks, emphasizing future directions such as multi-table reasoning, long-context processing, and financial terminology comprehension. All our datasets, models, and codes are publicly available. Furthermore, we introduce a leaderboard for benchmarking future datasets and models.
Submitted 12 February, 2025; originally announced February 2025.
Comments: Ongoing work, 13 pages, 2 figures, 3 Tables

6. arXiv:2502.06324 [pdf, other] cs.CV cs.AI
Title: UniDemoiré: Towards Universal Image Demoiréing with Data Generation and Synthesis
Authors: Zemin Yang, Yujing Sun, Xidong Peng, Siu Ming Yiu, Yuexin Ma
Abstract: Image demoiréing poses one of the most formidable challenges in image restoration, primarily due to the unpredictable and anisotropic nature of moiré patterns. Limited by the quantity and diversity of training data, current methods tend to overfit to a single moiré domain, resulting in performance degradation for new domains and restricting their robustness in real-world applications. In this paper, we propose a universal image demoiréing solution, UniDemoiré, which has superior generalization capability. Notably, we propose innovative and effective data generation and synthesis methods that can automatically provide vast high-quality moiré images to train a universal demoiréing model. Our extensive experiments demonstrate the cutting-edge performance and broad potential of our approach for generalized image demoiréing.
Submitted 10 February, 2025; originally announced February 2025.
Comments: Accepted by AAAI 2025

7. arXiv:2502.05853 [pdf, ps, other] cs.IT
Title: Zak-Transform-Induced Optimal Sequences and Their Applications in OTFS
Authors: Xiuping Peng, Congying Wu, Zilong Liu, Chunlei Li, Jianye Zhang, Pingzhi Fan
Abstract: This paper introduces a novel finite Zak transform (FZT)-aided framework for constructing multiple zero-correlation zone (ZCZ) sequence sets with optimal correlation properties. Specifically, each sequence is perfect with zero auto-correlation sidelobes, each ZCZ sequence set meets the Tang-Fan-Matsufuji bound with equality, and the maximum inter-set cross-correlation of multiple sequence sets meets the Sarwate bound with equality. Our study shows that these sequences can be sparsely expressed in the Zak domain through properly selected index and phase matrices. In particular, the maximum inter-set cross-correlation meets the Sarwate bound if every index matrix is a circular Florentine array. Several construction methods for multiple ZCZ sequence sets are proposed, demonstrating both optimality and high flexibility. Additionally, the proposed sequences are shown to achieve excellent synchronization performance in orthogonal-time-frequency-space (OTFS) systems.
Submitted 9 February, 2025; originally announced February 2025.
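
The "perfect sequence" property named in this abstract (zero periodic auto-correlation sidelobes) can be checked numerically. The snippet below does so for a Zadoff-Chu sequence, a classical example of a perfect sequence; it only illustrates the property and is not the paper's Zak-transform construction.

    import numpy as np

    N, q = 63, 25                     # odd length, root q coprime to N
    n = np.arange(N)
    x = np.exp(-1j * np.pi * q * n * (n + 1) / N)   # Zadoff-Chu sequence

    # Periodic auto-correlation via the FFT: r = IFFT(|FFT(x)|^2).
    r = np.fft.ifft(np.abs(np.fft.fft(x)) ** 2)

    print(abs(r[0]))                  # ~63 (= N) at zero lag
    print(np.max(np.abs(r[1:])))      # ~1e-13: every sidelobe vanishes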

8. arXiv:2502.05275 [pdf, other] cs.CV
Title: Interpretable Failure Detection with Human-Level Concepts
Authors: Kien X. Nguyen, Tang Li, Xi Peng
Abstract: Reliable failure detection holds paramount importance in safety-critical applications. Yet, neural networks are known to produce overconfident predictions for misclassified samples. This remains problematic because existing confidence score functions rely on category-level signals, the logits, to detect failures. This research introduces an innovative strategy, leveraging human-level concepts for a dual purpose: to reliably detect when a model fails and to transparently interpret why. By integrating a nuanced array of signals for each category, our method enables a finer-grained assessment of the model's confidence. We present a simple yet highly effective approach based on the ordinal ranking of concept activations for the input image. Without bells and whistles, our method significantly reduces the false positive rate across diverse real-world image classification benchmarks, specifically by 3.7% on ImageNet and 9% on EuroSAT.
Submitted 7 February, 2025; originally announced February 2025.
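
An invented toy example of scoring failures by the ordinal ranking of concept activations, in the spirit of the abstract above: an input whose most active concepts match those expected for its predicted class is trusted more. The concept sets and the exact scoring rule are illustrative stand-ins, not the paper's method.

    import numpy as np

    concepts = ["wing", "beak", "feather", "wheel", "window", "headlight"]
    class_concepts = {"bird": {"wing", "beak", "feather"},
                      "car": {"wheel", "window", "headlight"}}

    def failure_score(activations, predicted_class):
        """Mean rank (0 = most active) of the predicted class's concepts;
        a high mean rank signals a likely misclassification."""
        order = np.argsort(-np.asarray(activations))   # concept indices, most active first
        ranks = {concepts[idx]: rank for rank, idx in enumerate(order)}
        return np.mean([ranks[c] for c in class_concepts[predicted_class]])

    # A plausible "bird": bird concepts dominate, so their mean rank is low.
    print(failure_score([0.9, 0.8, 0.7, 0.1, 0.2, 0.1], "bird"))   # 1.0
    # A suspicious "bird": car concepts dominate, so the score is high.
    print(failure_score([0.1, 0.2, 0.1, 0.9, 0.8, 0.7], "bird"))   # 4.0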

9. arXiv:2502.04630 [pdf, other] cs.CV cs.GR
Title: High-Speed Dynamic 3D Imaging with Sensor Fusion Splatting
Authors: Zihao Zou, Ziyuan Qu, Xi Peng, Vivek Boominathan, Adithya Pediredla, Praneeth Chakravarthula
Abstract: Capturing and reconstructing high-speed dynamic 3D scenes has numerous applications in computer graphics, vision, and interdisciplinary fields such as robotics, aerodynamics, and evolutionary biology. However, achieving this using a single imaging modality remains challenging. For instance, traditional RGB cameras suffer from low frame rates, limited exposure times, and narrow baselines. To address this, we propose a novel sensor fusion approach using Gaussian splatting, which combines RGB, depth, and event cameras to capture and reconstruct deforming scenes at high speeds. The key insight of our method lies in leveraging the complementary strengths of these imaging modalities: RGB cameras capture detailed color information, event cameras record rapid scene changes with microsecond resolution, and depth cameras provide 3D scene geometry. To unify the underlying scene representation across these modalities, we represent the scene using deformable 3D Gaussians. To handle rapid scene movements, we jointly optimize the 3D Gaussian parameters and their temporal deformation fields by integrating data from all three sensor modalities. This fusion enables efficient, high-quality imaging of fast and complex scenes, even under challenging conditions such as low light, narrow baselines, or rapid motion. Experiments on synthetic and real datasets captured with our prototype sensor fusion setup demonstrate that our method significantly outperforms state-of-the-art techniques, achieving noticeable improvements in both rendering fidelity and structural accuracy.
Submitted 6 February, 2025; originally announced February 2025.
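
A schematic sketch of the joint multi-sensor optimization this abstract describes: one shared scene representation receives gradients from RGB, depth, and event losses simultaneously. Random projections stand in for the per-modality image formation of a real Gaussian-splatting renderer; all shapes and weights are invented.

    import torch

    torch.manual_seed(0)

    # Shared toy scene parameters; a real system optimizes per-Gaussian position,
    # scale, rotation, opacity, and color, plus temporal deformation fields.
    scene = torch.randn(16, requires_grad=True)
    opt = torch.optim.Adam([scene], lr=0.05)

    # Fixed random "projections" stand in for RGB/depth/event image formation.
    proj = {m: torch.randn(8, 16) for m in ("rgb", "depth", "event")}
    obs = {m: torch.randn(8) for m in proj}       # toy per-modality observations

    for step in range(200):
        # One loss per modality, all flowing into the same representation.
        loss = sum(torch.nn.functional.mse_loss(P @ scene, obs[m])
                   for m, P in proj.items())
        opt.zero_grad(); loss.backward(); opt.step()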

10. arXiv:2501.16566 [pdf, other] cs.HC
Title: AffectGPT: A New Dataset, Model, and Benchmark for Emotion Understanding with Multimodal Large Language Models
Authors: Zheng Lian, Haoyu Chen, Lan Chen, Haiyang Sun, Licai Sun, Yong Ren, Zebang Cheng, Bin Liu, Rui Liu, Xiaojiang Peng, Jiangyan Yi, Jianhua Tao
Abstract: The emergence of multimodal large language models (MLLMs) advances multimodal emotion recognition (MER) to the next level: from naive discriminative tasks to complex emotion understanding with advanced video understanding abilities and natural language description. However, the current community suffers from a lack of large-scale datasets with intensive, descriptive emotion annotations, as well as a multimodal-centric framework to maximize the potential of MLLMs for emotion understanding. To address this, we establish a new benchmark for MLLM-based emotion understanding with a novel dataset (MER-Caption) and a new model (AffectGPT). Utilizing our model-based crowd-sourcing data collection strategy, we construct the largest descriptive emotion dataset to date (by far), featuring over 2K fine-grained emotion categories across 115K samples. We also introduce the AffectGPT model, designed with pre-fusion operations to enhance multimodal integration. Finally, we present MER-UniBench, a unified benchmark with evaluation metrics tailored for both typical MER tasks and the free-form, natural language output style of MLLMs. Extensive experimental results demonstrate AffectGPT's robust performance across various MER tasks. We are publicly releasing both the AffectGPT model and the MER-Caption dataset to foster further research and development in emotion understanding.
Submitted 27 January, 2025; originally announced January 2025.

11. arXiv:2501.15055 [pdf, other] q-bio.BM cs.AI
Title: Group Ligands Docking to Protein Pockets
Authors: Jiaqi Guan, Jiahan Li, Xiangxin Zhou, Xingang Peng, Sheng Wang, Yunan Luo, Jian Peng, Jianzhu Ma
Abstract: Molecular docking is a key task in computational biology that has attracted increasing interest from the machine learning community. While existing methods have achieved success, they generally treat each protein-ligand pair in isolation. Inspired by the biochemical observation that ligands binding to the same target protein tend to adopt similar poses, we propose GroupBind, a novel molecular docking framework that simultaneously considers multiple ligands docking to a protein. This is achieved by introducing an interaction layer for the group of ligands and a triangle attention module for embedding protein-ligand and group-ligand pairs. By integrating our approach with a diffusion-based docking model, we set a new state-of-the-art performance on the PDBBind blind docking benchmark, demonstrating the effectiveness of our proposed molecular docking paradigm.
Submitted 24 January, 2025; originally announced January 2025.
Comments: 18 pages, published in ICLR 2025
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.15055v1-abstract-full').style.display = 'none'; document.getElementById('2501.15055v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">18 pages, published in ICLR 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.10266">arXiv:2501.10266</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2501.10266">pdf</a>, <a href="https://arxiv.org/format/2501.10266">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> MutualForce: Mutual-Aware Enhancement for 4D Radar-LiDAR 3D Object Detection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Peng%2C+X">Xiangyuan Peng</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+H">Huawei Sun</a>, <a href="/search/cs?searchtype=author&amp;query=Bierzynski%2C+K">Kay Bierzynski</a>, <a href="/search/cs?searchtype=author&amp;query=Fischbacher%2C+A">Anton Fischbacher</a>, <a href="/search/cs?searchtype=author&amp;query=Servadei%2C+L">Lorenzo Servadei</a>, <a href="/search/cs?searchtype=author&amp;query=Wille%2C+R">Robert Wille</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.10266v1-abstract-short" style="display: inline;"> Radar and LiDAR have been widely used in autonomous driving as LiDAR provides rich structure information, and radar demonstrates high robustness under adverse weather. Recent studies highlight the effectiveness of fusing radar and LiDAR point clouds. However, challenges remain due to the modality misalignment and information loss during feature extractions. To address these issues, we propose a 4D&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.10266v1-abstract-full').style.display = 'inline'; document.getElementById('2501.10266v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.10266v1-abstract-full" style="display: none;"> Radar and LiDAR have been widely used in autonomous driving as LiDAR provides rich structure information, and radar demonstrates high robustness under adverse weather. Recent studies highlight the effectiveness of fusing radar and LiDAR point clouds. However, challenges remain due to the modality misalignment and information loss during feature extractions. To address these issues, we propose a 4D radar-LiDAR framework to mutually enhance their representations. Initially, the indicative features from radar are utilized to guide both radar and LiDAR geometric feature learning. Subsequently, to mitigate their sparsity gap, the shape information from LiDAR is used to enrich radar BEV features. 
Extensive experiments on the View-of-Delft (VoD) dataset demonstrate our approach&#39;s superiority over existing methods, achieving the highest mAP of 71.76% across the entire area and 86.36% within the driving corridor. Especially for cars, we improve the AP by 4.17% and 4.20% due to the strong indicative features and symmetric shapes. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.10266v1-abstract-full').style.display = 'none'; document.getElementById('2501.10266v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by ICASSP 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.05456">arXiv:2501.05456</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2501.05456">pdf</a>, <a href="https://arxiv.org/format/2501.05456">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Software Engineering">cs.SE</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> </div> </div> <p class="title is-5 mathjax"> LLM Based Input Space Partitioning Testing for Library APIs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Li%2C+J">Jiageng Li</a>, <a href="/search/cs?searchtype=author&amp;query=Dong%2C+Z">Zhen Dong</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+C">Chong Wang</a>, <a href="/search/cs?searchtype=author&amp;query=You%2C+H">Haozhen You</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+C">Cen Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Y">Yang Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Peng%2C+X">Xin Peng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.05456v1-abstract-short" style="display: inline;"> Automated library API testing is difficult as it requires exploring a vast space of parameter inputs that may involve objects with complex data types. Existing search-based approaches, with limited knowledge of relations between object states and program branches, often suffer from the low efficiency issue, i.e., tending to generate invalid inputs. Symbolic-execution-based approaches can effectiv&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.05456v1-abstract-full').style.display = 'inline'; document.getElementById('2501.05456v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.05456v1-abstract-full" style="display: none;"> Automated library API testing is difficult as it requires exploring a vast space of parameter inputs that may involve objects with complex data types. 
Existing search-based approaches, with limited knowledge of relations between object states and program branches, often suffer from the low efficiency issue, i.e., tending to generate invalid inputs. Symbolic-execution-based approaches can effectively identify such relations, but fail to scale to large programs. In this work, we present an LLM-based input space partitioning testing approach, LISP, for library APIs. The approach leverages LLMs to understand the code of a library API under test and perform input space partitioning based on its understanding and rich common knowledge. Specifically, we provide the signature and code of the API under test to LLMs, with the expectation of obtaining a text description of each input space partition of the API under test. Then, we generate inputs by employing the generated text descriptions to sample from each partition, ultimately resulting in test suites that systematically explore the program behavior of the API. We evaluate LISP on more than 2,205 library API methods taken from 10 popular open-source Java libraries (e.g., apache/commons-lang with 2.6k stars, guava with 48.8k stars on GitHub). Our experiment results show that LISP is effective in library API testing. It significantly outperforms the state-of-the-art tool EvoSuite in terms of edge coverage. On average, LISP achieves 67.82% branch coverage, surpassing EvoSuite by 1.21 times. In total, LISP triggers 404 exceptions or errors in the experiments, and discovers 13 previously unknown vulnerabilities during evaluation, which have been assigned CVE IDs. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.05456v1-abstract-full').style.display = 'none'; document.getElementById('2501.05456v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">11 pages, 10 figures. 
Proceedings of the 47th IEEE/ACM International Conference on Software Engineering (ICSE 2025)</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> D.2.5 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.02116">arXiv:2501.02116</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2501.02116">pdf</a>, <a href="https://arxiv.org/format/2501.02116">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> Humanoid Locomotion and Manipulation: Current Progress and Challenges in Control, Planning, and Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Gu%2C+Z">Zhaoyuan Gu</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+J">Junheng Li</a>, <a href="/search/cs?searchtype=author&amp;query=Shen%2C+W">Wenlan Shen</a>, <a href="/search/cs?searchtype=author&amp;query=Yu%2C+W">Wenhao Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Xie%2C+Z">Zhaoming Xie</a>, <a href="/search/cs?searchtype=author&amp;query=McCrory%2C+S">Stephen McCrory</a>, <a href="/search/cs?searchtype=author&amp;query=Cheng%2C+X">Xianyi Cheng</a>, <a href="/search/cs?searchtype=author&amp;query=Shamsah%2C+A">Abdulaziz Shamsah</a>, <a href="/search/cs?searchtype=author&amp;query=Griffin%2C+R">Robert Griffin</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+C+K">C. Karen Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Kheddar%2C+A">Abderrahmane Kheddar</a>, <a href="/search/cs?searchtype=author&amp;query=Peng%2C+X+B">Xue Bin Peng</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+Y">Yuke Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Shi%2C+G">Guanya Shi</a>, <a href="/search/cs?searchtype=author&amp;query=Nguyen%2C+Q">Quan Nguyen</a>, <a href="/search/cs?searchtype=author&amp;query=Cheng%2C+G">Gordon Cheng</a>, <a href="/search/cs?searchtype=author&amp;query=Gao%2C+H">Huijun Gao</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+Y">Ye Zhao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.02116v1-abstract-short" style="display: inline;"> Humanoid robots have great potential to perform various human-level skills. These skills involve locomotion, manipulation, and cognitive capabilities. Driven by advances in machine learning and the strength of existing model-based approaches, these capabilities have progressed rapidly, but often separately. Therefore, a timely overview of current progress and future trends in this fast-evolving fi&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.02116v1-abstract-full').style.display = 'inline'; document.getElementById('2501.02116v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.02116v1-abstract-full" style="display: none;"> Humanoid robots have great potential to perform various human-level skills. These skills involve locomotion, manipulation, and cognitive capabilities. 
Driven by advances in machine learning and the strength of existing model-based approaches, these capabilities have progressed rapidly, but often separately. Therefore, a timely overview of current progress and future trends in this fast-evolving field is essential. This survey first summarizes the model-based planning and control that have been the backbone of humanoid robotics for the past three decades. We then explore emerging learning-based methods, with a focus on reinforcement learning and imitation learning that enhance the versatility of loco-manipulation skills. We examine the potential of integrating foundation models with humanoid embodiments, assessing the prospects for developing generalist humanoid agents. In addition, this survey covers emerging research for whole-body tactile sensing that unlocks new humanoid skills that involve physical interactions. The survey concludes with a discussion of the challenges and future trends. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.02116v1-abstract-full').style.display = 'none'; document.getElementById('2501.02116v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.00861">arXiv:2501.00861</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2501.00861">pdf</a>, <a href="https://arxiv.org/format/2501.00861">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> </div> </div> <p class="title is-5 mathjax"> Alzheimer&#39;s disease detection based on large language model prompt engineering </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zheng%2C+T">Tian Zheng</a>, <a href="/search/cs?searchtype=author&amp;query=Xie%2C+X">Xurong Xie</a>, <a href="/search/cs?searchtype=author&amp;query=Peng%2C+X">Xiaolan Peng</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+H">Hui Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Tian%2C+F">Feng Tian</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.00861v1-abstract-short" style="display: inline;"> In light of the growing proportion of older individuals in our society, the timely diagnosis of Alzheimer&#39;s disease has become a crucial aspect of healthcare. In this paper, we propose a non-invasive and cost-effective detection method based on speech technology. 
The method employs a pre-trained language model in conjunction with techniques such as prompt fine-tuning and conditional learning, ther&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.00861v1-abstract-full').style.display = 'inline'; document.getElementById('2501.00861v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.00861v1-abstract-full" style="display: none;"> In light of the growing proportion of older individuals in our society, the timely diagnosis of Alzheimer&#39;s disease has become a crucial aspect of healthcare. In this paper, we propose a non-invasive and cost-effective detection method based on speech technology. The method employs a pre-trained language model in conjunction with techniques such as prompt fine-tuning and conditional learning, thereby enhancing the accuracy and efficiency of the detection process. To address the issue of limited computational resources, this study employs the efficient LoRA fine-tuning method to construct the classification model. Following multiple rounds of training and rigorous 10-fold cross-validation, the prompt fine-tuning strategy based on the LLAMA2 model demonstrated an accuracy of 81.31%, representing a 4.46% improvement over the control group employing the BERT model. This study offers a novel technical approach for the early diagnosis of Alzheimer&#39;s disease and provides valuable insights into model optimization and resource utilization under similar conditions. It is anticipated that this method will prove beneficial in clinical practice and applied research, facilitating more accurate and efficient screening and diagnosis of Alzheimer&#39;s disease. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.00861v1-abstract-full').style.display = 'none'; document.getElementById('2501.00861v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. 
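<p class="is-size-7">Since the abstract leans on LoRA-style fine-tuning, a minimal sketch of the low-rank update may be helpful (dimensions and names are illustrative, not the study's configuration):</p> <pre><code>
# LoRA in one picture: the frozen weight W is adapted by a low-rank product
# B @ A, so only r * (d_in + d_out) parameters are trained per layer.
import numpy as np

rng = np.random.default_rng(0)
d_in, d_out, r, alpha = 64, 64, 8, 16

W = rng.normal(size=(d_out, d_in))       # frozen pre-trained weight
A = rng.normal(size=(r, d_in)) * 0.01    # trainable low-rank factor
B = np.zeros((d_out, r))                 # zero init: no drift at step 0

def lora_forward(x):
    # Effective weight is W + (alpha / r) * B @ A; W itself never changes.
    return x @ (W + (alpha / r) * B @ A).T

x = rng.normal(size=(2, d_in))
print(lora_forward(x).shape)             # (2, 64)
</code></pre>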
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.20404">arXiv:2412.20404</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.20404">pdf</a>, <a href="https://arxiv.org/format/2412.20404">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Open-Sora: Democratizing Efficient Video Production for All </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zheng%2C+Z">Zangwei Zheng</a>, <a href="/search/cs?searchtype=author&amp;query=Peng%2C+X">Xiangyu Peng</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+T">Tianji Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Shen%2C+C">Chenhui Shen</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+S">Shenggui Li</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+H">Hongxin Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+Y">Yukun Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+T">Tianyi Li</a>, <a href="/search/cs?searchtype=author&amp;query=You%2C+Y">Yang You</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.20404v1-abstract-short" style="display: inline;"> Vision and language are the two foundational senses for humans, and they build up our cognitive ability and intelligence. While significant breakthroughs have been made in AI language ability, artificial visual intelligence, especially the ability to generate and simulate the world we see, is far lagging behind. To facilitate the development and accessibility of artificial visual intelligence, we&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.20404v1-abstract-full').style.display = 'inline'; document.getElementById('2412.20404v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.20404v1-abstract-full" style="display: none;"> Vision and language are the two foundational senses for humans, and they build up our cognitive ability and intelligence. While significant breakthroughs have been made in AI language ability, artificial visual intelligence, especially the ability to generate and simulate the world we see, is far lagging behind. To facilitate the development and accessibility of artificial visual intelligence, we created Open-Sora, an open-source video generation model designed to produce high-fidelity video content. Open-Sora supports a wide spectrum of visual generation tasks, including text-to-image generation, text-to-video generation, and image-to-video generation. The model leverages advanced deep learning architectures and training/inference techniques to enable flexible video synthesis, which could generate video content of up to 15 seconds, up to 720p resolution, and arbitrary aspect ratios. Specifically, we introduce Spatial-Temporal Diffusion Transformer (STDiT), an efficient diffusion framework for videos that decouples spatial and temporal attention. We also introduce a highly compressive 3D autoencoder to make representations compact and further accelerate training with an ad hoc training strategy. 
Through this initiative, we aim to foster innovation, creativity, and inclusivity within the community of AI content creation. By embracing the open-source principle, Open-Sora democratizes full access to all the training/inference/data preparation codes as well as model weights. All resources are publicly available at: https://github.com/hpcaitech/Open-Sora. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.20404v1-abstract-full').style.display = 'none'; document.getElementById('2412.20404v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.20066">arXiv:2412.20066</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.20066">pdf</a>, <a href="https://arxiv.org/format/2412.20066">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> MaIR: A Locality- and Continuity-Preserving Mamba for Image Restoration </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Li%2C+B">Boyun Li</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+H">Haiyu Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+W">Wenxin Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Hu%2C+P">Peng Hu</a>, <a href="/search/cs?searchtype=author&amp;query=Gou%2C+Y">Yuanbiao Gou</a>, <a href="/search/cs?searchtype=author&amp;query=Peng%2C+X">Xi Peng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.20066v1-abstract-short" style="display: inline;"> Recent advancements in Mamba have shown promising results in image restoration. These methods typically flatten 2D images into multiple distinct 1D sequences along rows and columns, process each sequence independently using selective scan operation, and recombine them to form the outputs. However, such a paradigm overlooks two vital aspects: i) the local relationships and spatial continuity inhere&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.20066v1-abstract-full').style.display = 'inline'; document.getElementById('2412.20066v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.20066v1-abstract-full" style="display: none;"> Recent advancements in Mamba have shown promising results in image restoration. These methods typically flatten 2D images into multiple distinct 1D sequences along rows and columns, process each sequence independently using selective scan operation, and recombine them to form the outputs. However, such a paradigm overlooks two vital aspects: i) the local relationships and spatial continuity inherent in natural images, and ii) the discrepancies among sequences unfolded through totally different ways. 
To overcome the drawbacks, we explore two problems in Mamba-based restoration methods: i) how to design a scanning strategy preserving both locality and continuity while facilitating restoration, and ii) how to aggregate the distinct sequences unfolded in totally different ways. To address these problems, we propose a novel Mamba-based Image Restoration model (MaIR), which consists of Nested S-shaped Scanning strategy (NSS) and Sequence Shuffle Attention block (SSA). Specifically, NSS preserves locality and continuity of the input images through the stripe-based scanning region and the S-shaped scanning path, respectively. SSA aggregates sequences through calculating attention weights within the corresponding channels of different sequences. Thanks to NSS and SSA, MaIR surpasses 40 baselines across 14 challenging datasets, achieving state-of-the-art performance on the tasks of image super-resolution, denoising, deblurring and dehazing. Our codes will be available after acceptance. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.20066v1-abstract-full').style.display = 'none'; document.getElementById('2412.20066v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.18838">arXiv:2412.18838</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.18838">pdf</a>, <a href="https://arxiv.org/format/2412.18838">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> DiFiC: Your Diffusion Model Holds the Secret to Fine-Grained Clustering </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Yang%2C+R">Ruohong Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Hu%2C+P">Peng Hu</a>, <a href="/search/cs?searchtype=author&amp;query=Peng%2C+X">Xi Peng</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+X">Xiting Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Y">Yunfan Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.18838v1-abstract-short" style="display: inline;"> Fine-grained clustering is a practical yet challenging task, whose essence lies in capturing the subtle differences between instances of different classes. Such subtle differences can be easily disrupted by data augmentation or be overwhelmed by redundant information in data, leading to significant performance degradation for existing clustering methods. 
In this work, we introduce DiFiC, a fine-gra&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.18838v1-abstract-full').style.display = 'inline'; document.getElementById('2412.18838v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.18838v1-abstract-full" style="display: none;"> Fine-grained clustering is a practical yet challenging task, whose essence lies in capturing the subtle differences between instances of different classes. Such subtle differences can be easily disrupted by data augmentation or be overwhelmed by redundant information in data, leading to significant performance degradation for existing clustering methods. In this work, we introduce DiFiC, a fine-grained clustering method building upon the conditional diffusion model. Distinct from existing works that focus on extracting discriminative features from images, DiFiC resorts to deducing the textual conditions used for image generation. To distill more precise and clustering-favorable object semantics, DiFiC further regularizes the diffusion target and guides the distillation process utilizing neighborhood similarity. Extensive experiments demonstrate that DiFiC outperforms both state-of-the-art discriminative and generative clustering methods on four fine-grained image clustering benchmarks. We hope the success of DiFiC will inspire future research to unlock the potential of diffusion models in tasks beyond generation. The code will be released. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.18838v1-abstract-full').style.display = 'none'; document.getElementById('2412.18838v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.18174">arXiv:2412.18174</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.18174">pdf</a>, <a href="https://arxiv.org/format/2412.18174">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computational Engineering, Finance, and Science">cs.CE</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computational Finance">q-fin.CP</span> </div> </div> <p class="title is-5 mathjax"> INVESTORBENCH: A Benchmark for Financial Decision-Making Tasks with LLM-based Agent </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Li%2C+H">Haohang Li</a>, <a href="/search/cs?searchtype=author&amp;query=Cao%2C+Y">Yupeng Cao</a>, <a href="/search/cs?searchtype=author&amp;query=Yu%2C+Y">Yangyang Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Javaji%2C+S+R">Shashidhar Reddy Javaji</a>, <a href="/search/cs?searchtype=author&amp;query=Deng%2C+Z">Zhiyang Deng</a>, <a href="/search/cs?searchtype=author&amp;query=He%2C+Y">Yueru He</a>, <a href="/search/cs?searchtype=author&amp;query=Jiang%2C+Y">Yuechen Jiang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+Z">Zining Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Subbalakshmi%2C+K">Koduvayur Subbalakshmi</a>, <a href="/search/cs?searchtype=author&amp;query=Xiong%2C+G">Guojun Xiong</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+J">Jimin Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Qian%2C+L">Lingfei Qian</a>, <a href="/search/cs?searchtype=author&amp;query=Peng%2C+X">Xueqing Peng</a>, <a href="/search/cs?searchtype=author&amp;query=Xie%2C+Q">Qianqian Xie</a>, <a href="/search/cs?searchtype=author&amp;query=Suchow%2C+J+W">Jordan W. Suchow</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.18174v1-abstract-short" style="display: inline;"> Recent advancements have underscored the potential of large language model (LLM)-based agents in financial decision-making. Despite this progress, the field currently encounters two main challenges: (1) the lack of a comprehensive LLM agent framework adaptable to a variety of financial tasks, and (2) the absence of standardized benchmarks and consistent datasets for assessing agent performance. To&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.18174v1-abstract-full').style.display = 'inline'; document.getElementById('2412.18174v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.18174v1-abstract-full" style="display: none;"> Recent advancements have underscored the potential of large language model (LLM)-based agents in financial decision-making. Despite this progress, the field currently encounters two main challenges: (1) the lack of a comprehensive LLM agent framework adaptable to a variety of financial tasks, and (2) the absence of standardized benchmarks and consistent datasets for assessing agent performance. 
To tackle these issues, we introduce InvestorBench, the first benchmark specifically designed for evaluating LLM-based agents in diverse financial decision-making contexts. InvestorBench enhances the versatility of LLM-enabled agents by providing a comprehensive suite of tasks applicable to different financial products, including single equities like stocks, cryptocurrencies and exchange-traded funds (ETFs). Additionally, we assess the reasoning and decision-making capabilities of our agent framework using thirteen different LLMs as backbone models, across various market environments and tasks. Furthermore, we have curated a diverse collection of open-source, multi-modal datasets and developed a comprehensive suite of environments for financial decision-making. This establishes a highly accessible platform for evaluating financial agents&#39; performance across various scenarios. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.18174v1-abstract-full').style.display = 'none'; document.getElementById('2412.18174v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.16654">arXiv:2412.16654</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.16654">pdf</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> IV-tuning: Parameter-Efficient Transfer Learning for Infrared-Visible Tasks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Y">Yaming Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Gao%2C+C">Chenqiang Gao</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+F">Fangcen Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Guo%2C+J">Junjie Guo</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+L">Lan Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Peng%2C+X">Xinggan Peng</a>, <a href="/search/cs?searchtype=author&amp;query=Meng%2C+D">Deyu Meng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.16654v1-abstract-short" style="display: inline;"> Infrared-visible (IR-VIS) tasks, such as semantic segmentation and object detection, greatly benefit from the advantage of combining infrared and visible modalities. To inherit the general representations of the Vision Foundation Models (VFMs), task-specific dual-branch networks are designed and fully fine-tuned on downstream datasets. 
Although effective, this approach lacks generality and is sub-op&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.16654v1-abstract-full').style.display = 'inline'; document.getElementById('2412.16654v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.16654v1-abstract-full" style="display: none;"> Infrared-visible (IR-VIS) tasks, such as semantic segmentation and object detection, greatly benefit from the advantage of combining infrared and visible modalities. To inherit the general representations of the Vision Foundation Models (VFMs), task-specific dual-branch networks are designed and fully fine-tuned on downstream datasets. Although effective, this approach lacks generality and is sub-optimal due to the scarcity of downstream infrared-visible datasets and limited transferability. In this paper, we propose a novel and general fine-tuning approach, namely &#34;IV-tuning&#34;, to parameter-efficiently harness VFMs for various infrared-visible downstream tasks. At its core, IV-tuning freezes pre-trained visible-based VFMs and integrates modal-specific prompts with adapters within the backbone, bridging the gap between VFMs and downstream infrared-visible tasks while simultaneously learning the complementarity between different modalities. By fine-tuning approximately 3% of the backbone parameters, IV-tuning outperforms full fine-tuning across various baselines in infrared-visible semantic segmentation and object detection, as well as previous state-of-the-art methods. Extensive experiments across various settings demonstrate that IV-tuning achieves superior performance with fewer training parameters, providing a good alternative to full fine-tuning and a novel method of extending visible-based models for infrared-visible tasks. The code is available at https://github.com/Yummy198913/IV-tuning. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.16654v1-abstract-full').style.display = 'none'; document.getElementById('2412.16654v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. 
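<p class="is-size-7">The parameter-efficient recipe described above (a frozen backbone plus small trainable modules) can be sketched as follows; the bottleneck adapter and all sizes are assumptions for illustration, not the paper's architecture:</p> <pre><code>
# Freeze the pre-trained backbone and train only tiny residual adapters.
import torch
import torch.nn as nn

class Adapter(nn.Module):
    def __init__(self, dim, bottleneck=16):
        super().__init__()
        self.down = nn.Linear(dim, bottleneck)
        self.up = nn.Linear(bottleneck, dim)

    def forward(self, x):
        # Residual bottleneck: a cheap, modality-specific adjustment.
        return x + self.up(torch.relu(self.down(x)))

backbone = nn.Sequential(nn.Linear(128, 128), nn.GELU(), nn.Linear(128, 128))
for p in backbone.parameters():
    p.requires_grad = False              # the pre-trained VFM stays frozen

adapter = Adapter(128)                   # only these weights are trained
trainable = sum(p.numel() for p in adapter.parameters())
total = trainable + sum(p.numel() for p in backbone.parameters())
print(f"trainable fraction: {trainable / total:.1%}")
</code></pre>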
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.16380">arXiv:2412.16380</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.16380">pdf</a>, <a href="https://arxiv.org/format/2412.16380">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> LiRCDepth: Lightweight Radar-Camera Depth Estimation via Knowledge Distillation and Uncertainty Guidance </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Sun%2C+H">Huawei Sun</a>, <a href="/search/cs?searchtype=author&amp;query=Vysotskaya%2C+N">Nastassia Vysotskaya</a>, <a href="/search/cs?searchtype=author&amp;query=Sukianto%2C+T">Tobias Sukianto</a>, <a href="/search/cs?searchtype=author&amp;query=Feng%2C+H">Hao Feng</a>, <a href="/search/cs?searchtype=author&amp;query=Ott%2C+J">Julius Ott</a>, <a href="/search/cs?searchtype=author&amp;query=Peng%2C+X">Xiangyuan Peng</a>, <a href="/search/cs?searchtype=author&amp;query=Servadei%2C+L">Lorenzo Servadei</a>, <a href="/search/cs?searchtype=author&amp;query=Wille%2C+R">Robert Wille</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.16380v2-abstract-short" style="display: inline;"> Recently, radar-camera fusion algorithms have gained significant attention as radar sensors provide geometric information that compensates for the limitations of cameras. However, most existing radar-camera depth estimation algorithms focus solely on improving performance, often neglecting computational efficiency. To address this gap, we propose LiRCDepth, a lightweight radar-camera depth estimation&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.16380v2-abstract-full').style.display = 'inline'; document.getElementById('2412.16380v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.16380v2-abstract-full" style="display: none;"> Recently, radar-camera fusion algorithms have gained significant attention as radar sensors provide geometric information that compensates for the limitations of cameras. However, most existing radar-camera depth estimation algorithms focus solely on improving performance, often neglecting computational efficiency. To address this gap, we propose LiRCDepth, a lightweight radar-camera depth estimation model. We incorporate knowledge distillation to enhance the training process, transferring critical information from a complex teacher model to our lightweight student model in three key domains. Firstly, low-level and high-level features are transferred by incorporating pixel-wise and pair-wise distillation. Additionally, we introduce an uncertainty-aware inter-depth distillation loss to refine intermediate depth maps during decoding. Leveraging our proposed knowledge distillation scheme, the lightweight model achieves a 6.6% improvement in MAE on the nuScenes dataset compared to the model trained without distillation. 
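<p class="is-size-7">One plausible reading of the uncertainty-aware inter-depth distillation loss mentioned above, sketched with assumed names and weighting (the paper's exact formulation may differ):</p> <pre><code>
# Confident teacher pixels contribute more; claiming high uncertainty
# everywhere is penalised by the additive log-variance term.
import numpy as np

def uncertainty_aware_distill(student_depth, teacher_depth, teacher_logvar):
    weight = np.exp(-teacher_logvar)     # down-weight uncertain pixels
    return np.mean(weight * np.abs(student_depth - teacher_depth) + teacher_logvar)

rng = np.random.default_rng(0)
s = rng.uniform(0, 80, size=(48, 160))       # student depth map (metres)
t = s + rng.normal(0, 0.5, size=s.shape)     # teacher depth map
u = rng.normal(0, 0.1, size=s.shape)         # teacher per-pixel log-variance
print(float(uncertainty_aware_distill(s, t, u)))
</code></pre>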
Code: https://github.com/harborsarah/LiRCDepth <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.16380v2-abstract-full').style.display = 'none'; document.getElementById('2412.16380v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 20 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by ICASSP 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.13333">arXiv:2412.13333</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.13333">pdf</a>, <a href="https://arxiv.org/format/2412.13333">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Beyond Accuracy: On the Effects of Fine-tuning Towards Vision-Language Model&#39;s Prediction Rationality </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Q">Qitong Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+T">Tang Li</a>, <a href="/search/cs?searchtype=author&amp;query=Nguyen%2C+K+X">Kien X. Nguyen</a>, <a href="/search/cs?searchtype=author&amp;query=Peng%2C+X">Xi Peng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.13333v1-abstract-short" style="display: inline;"> Vision-Language Models (VLMs), such as CLIP, have already seen widespread applications. Researchers actively engage in further fine-tuning VLMs in safety-critical domains. In these domains, prediction rationality is crucial: the prediction should be correct and based on valid evidence. Yet, for VLMs, the impact of fine-tuning on prediction rationality is seldom investigated. To study this proble&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.13333v1-abstract-full').style.display = 'inline'; document.getElementById('2412.13333v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.13333v1-abstract-full" style="display: none;"> Vision-Language Models (VLMs), such as CLIP, have already seen widespread applications. Researchers actively engage in further fine-tuning VLMs in safety-critical domains. In these domains, prediction rationality is crucial: the prediction should be correct and based on valid evidence. Yet, for VLMs, the impact of fine-tuning on prediction rationality is seldom investigated. To study this problem, we proposed two new metrics called Prediction Trustworthiness and Inference Reliability. We conducted extensive experiments on various settings and observed some interesting phenomena. On the one hand, we found that the well-adopted fine-tuning methods led to more correct predictions based on invalid evidence. This potentially undermines the trustworthiness of correct predictions from fine-tuned VLMs. 
On the other hand, having identified valid evidence of target objects, fine-tuned VLMs were more likely to make correct predictions. Moreover, the findings are consistent under distributional shifts and across various experimental settings. We hope our research offers fresh insights into VLM fine-tuning. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.13333v1-abstract-full').style.display = 'none'; document.getElementById('2412.13333v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">In Proceedings of the Association for the Advancement of Artificial Intelligence (AAAI), 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.13196">arXiv:2412.13196</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.13196">pdf</a>, <a href="https://arxiv.org/format/2412.13196">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> ExBody2: Advanced Expressive Humanoid Whole-Body Control </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Ji%2C+M">Mazeyu Ji</a>, <a href="/search/cs?searchtype=author&amp;query=Peng%2C+X">Xuanbin Peng</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+F">Fangchen Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+J">Jialong Li</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+G">Ge Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Cheng%2C+X">Xuxin Cheng</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+X">Xiaolong Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.13196v1-abstract-short" style="display: inline;"> This paper enables real-world humanoid robots to maintain stability while performing expressive motions like humans do. We propose ExBody2, a generalized whole-body tracking framework that can take any reference motion inputs and control the humanoid to mimic the motion. The model is trained in simulation with Reinforcement Learning and then transferred to the real world. It decouples keypoint tra&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.13196v1-abstract-full').style.display = 'inline'; document.getElementById('2412.13196v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.13196v1-abstract-full" style="display: none;"> This paper enables real-world humanoid robots to maintain stability while performing expressive motions like humans do. 
We propose ExBody2, a generalized whole-body tracking framework that can take any reference motion inputs and control the humanoid to mimic the motion. The model is trained in simulation with Reinforcement Learning and then transferred to the real world. It decouples keypoint tracking from velocity control, and effectively leverages a privileged teacher policy to distill precise mimic skills into the target student policy, which enables high-fidelity replication of dynamic movements such as running, crouching, dancing, and other challenging motions. We present a comprehensive qualitative and quantitative analysis of crucial design factors in the paper. We conduct our experiments on two humanoid platforms and demonstrate the superiority of our approach over state-of-the-art methods, providing practical guidelines to pursue the extreme of whole-body control for humanoid robots. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.13196v1-abstract-full').style.display = 'none'; document.getElementById('2412.13196v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">website: https://exbody2.github.io</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.12496">arXiv:2412.12496</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.12496">pdf</a>, <a href="https://arxiv.org/format/2412.12496">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Faster Vision Mamba is Rebuilt in Minutes via Merged Token Re-training </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Shi%2C+M">Mingjia Shi</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+Y">Yuhao Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Yu%2C+R">Ruiji Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Z">Zekai Li</a>, <a href="/search/cs?searchtype=author&amp;query=Liang%2C+Z">Zhiyuan Liang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+X">Xuanlei Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Peng%2C+X">Xiaojiang Peng</a>, <a href="/search/cs?searchtype=author&amp;query=Vedantam%2C+S+R">Shanmukha Ramakrishna Vedantam</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+W">Wangbo Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+K">Kai Wang</a>, <a href="/search/cs?searchtype=author&amp;query=You%2C+Y">Yang You</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.12496v2-abstract-short" style="display: inline;"> Vision Mamba (e.g., Vim) has successfully been integrated into computer vision, and token reduction has yielded promising outcomes in Vision Transformers (ViTs). 
However, token reduction performs less effectively on Vision Mamba compared to ViTs. Pruning informative tokens in Mamba leads to a high loss of key knowledge and bad performance. This makes it a poor solution for enhancing efficiency&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.12496v2-abstract-full').style.display = 'inline'; document.getElementById('2412.12496v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.12496v2-abstract-full" style="display: none;"> Vision Mamba (e.g., Vim) has successfully been integrated into computer vision, and token reduction has yielded promising outcomes in Vision Transformers (ViTs). However, token reduction performs less effectively on Vision Mamba compared to ViTs. Pruning informative tokens in Mamba leads to a high loss of key knowledge and bad performance. This makes it a poor solution for enhancing efficiency in Mamba. Token merging, which preserves more token information than pruning, has demonstrated commendable performance in ViTs. Nevertheless, vanilla merging performance likewise decreases as the reduction ratio increases, failing to maintain the key knowledge in Mamba. Re-training the token-reduced model enhances the performance of Mamba by effectively rebuilding the key knowledge. Empirically, pruned Vims drop only up to 0.9% accuracy on ImageNet-1K, which is recovered by our proposed framework R-MeeTo in our main evaluation. We show how simply and effectively fast recovery can be achieved at the minute level: in particular, a 35.9% accuracy gain over 3 epochs of training on Vim-Ti. Moreover, Vim-Ti/S/B are re-trained within 5/7/17 minutes, and Vim-S drops only 1.3% with a 1.2x (up to 1.5x) speed-up in inference. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.12496v2-abstract-full').style.display = 'none'; document.getElementById('2412.12496v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 16 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. 
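<p class="is-size-7">To see why merging preserves more information than pruning, consider a toy one-step merge (not the R-MeeTo implementation; names are illustrative) that averages the two most cosine-similar tokens instead of dropping one:</p> <pre><code>
# Merge the two most similar tokens: their information is combined, not lost.
import numpy as np

def merge_one_pair(tokens):
    x = tokens / np.linalg.norm(tokens, axis=1, keepdims=True)
    sim = x @ x.T
    np.fill_diagonal(sim, -np.inf)           # ignore self-similarity
    i, j = np.unravel_index(np.argmax(sim), sim.shape)
    merged = (tokens[i] + tokens[j]) / 2     # average instead of pruning
    keep = [k for k in range(len(tokens)) if k not in (i, j)]
    return np.vstack([tokens[keep], merged[None, :]])

rng = np.random.default_rng(0)
seq = rng.normal(size=(8, 16))               # 8 tokens, 16 dims
print(merge_one_pair(seq).shape)             # (7, 16)
</code></pre>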
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">MSC Class:</span> 68T07 <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> I.2 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.12300">arXiv:2412.12300</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.12300">pdf</a>, <a href="https://arxiv.org/format/2412.12300">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Unanswerability Evaluation for Retrieval Augmented Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Peng%2C+X">Xiangyu Peng</a>, <a href="/search/cs?searchtype=author&amp;query=Choubey%2C+P+K">Prafulla Kumar Choubey</a>, <a href="/search/cs?searchtype=author&amp;query=Xiong%2C+C">Caiming Xiong</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+C">Chien-Sheng Wu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.12300v2-abstract-short" style="display: inline;"> Existing evaluation frameworks for retrieval-augmented generation (RAG) systems focus on answerable queries, but they overlook the importance of appropriately rejecting unanswerable requests. In this paper, we introduce UAEval4RAG, a framework designed to evaluate whether RAG systems can handle unanswerable queries effectively. We define a taxonomy with six unanswerable categories, and UAEval4RAG&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.12300v2-abstract-full').style.display = 'inline'; document.getElementById('2412.12300v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.12300v2-abstract-full" style="display: none;"> Existing evaluation frameworks for retrieval-augmented generation (RAG) systems focus on answerable queries, but they overlook the importance of appropriately rejecting unanswerable requests. In this paper, we introduce UAEval4RAG, a framework designed to evaluate whether RAG systems can handle unanswerable queries effectively. We define a taxonomy with six unanswerable categories, and UAEval4RAG automatically synthesizes diverse and challenging queries for any given knowledge base, evaluating systems with unanswered ratio and acceptable ratio metrics. We conduct experiments with various RAG components, including retrieval models, rewriting methods, rerankers, language models, and prompting strategies, and reveal hidden trade-offs in the performance of RAG systems. Our findings highlight the critical role of component selection and prompt design in optimizing RAG systems to balance the accuracy of answerable queries with high rejection rates of unanswerable ones. UAEval4RAG provides valuable insights and tools for developing more robust and reliable RAG systems. 
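<p class="is-size-7">Read literally from the abstract, the two metrics could be computed along these lines; the response fields and the notion of "acceptable" are assumptions, not the paper's definitions:</p> <pre><code>
# Share of unanswerable queries the system declines, and share of responses
# judged acceptable (here: declined, or the limitation is explained).
def unanswered_ratio(responses):
    return sum(r["declined"] for r in responses) / len(responses)

def acceptable_ratio(responses):
    return sum(r["declined"] or r["explains_limit"] for r in responses) / len(responses)

demo = [
    {"declined": True,  "explains_limit": True},
    {"declined": False, "explains_limit": True},
    {"declined": False, "explains_limit": False},
]
print(unanswered_ratio(demo), round(acceptable_ratio(demo), 2))  # ~0.33 0.67
</code></pre>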
arXiv:2412.10164 [pdf, other] (https://arxiv.org/abs/2412.10164)
Subjects: cs.SE (Software Engineering)
Title: Keep It Simple: Towards Accurate Vulnerability Detection for Large Code Graphs
Authors: Xin Peng, Shangwen Wang, Yihao Qin, Bo Lin, Liqian Chen, Xiaoguang Mao
Abstract: Software vulnerability detection is crucial for high-quality software development. Recently, studies utilizing Graph Neural Networks (GNNs) to learn graph representations of code for vulnerability detection have achieved remarkable success. However, existing graph-based approaches face two main limitations that prevent them from generalizing well to large code graphs: (1) the interference of noisy information in the code graph; and (2) the difficulty of capturing long-distance dependencies within the graph. To mitigate these problems, we propose a novel vulnerability detection method, ANGLE, whose novelty mainly lies in hierarchical graph refinement and context-aware graph representation learning. The former hierarchically filters redundant information in the code graph, thereby reducing the size of the graph, while the latter collaboratively employs a Graph Transformer and a GNN to learn code graph representations from both global and local perspectives, thus capturing long-distance dependencies. Extensive experiments demonstrate promising results on three widely used benchmark datasets: our method significantly outperforms several baselines in terms of accuracy and F1 score. In particular, on large code graphs, ANGLE improves accuracy by 34.27%-161.93% compared to the state-of-the-art method, AMPLE. These results demonstrate the effectiveness of ANGLE in vulnerability detection tasks.
Submitted 13 December, 2024; originally announced December 2024.
Comments: 17 pages
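The global-plus-local representation idea can be pictured with a small toy. The NumPy sketch below is not ANGLE's implementation; the single attention pass and mean aggregation are simplifications chosen only to contrast a local GNN-style neighbor aggregation with a global all-pairs attention pass, concatenating the two views per node.

```python
import numpy as np

def local_global_embeddings(adj, feats):
    """Toy contrast of local vs. global node mixing (not the ANGLE code)."""
    deg = adj.sum(axis=1, keepdims=True) + 1e-9
    local = (adj / deg) @ feats                       # mean over graph neighbors
    scores = feats @ feats.T / np.sqrt(feats.shape[1])
    attn = np.exp(scores - scores.max(axis=1, keepdims=True))
    attn /= attn.sum(axis=1, keepdims=True)           # softmax over all nodes
    global_ = attn @ feats                            # long-distance mixing
    return np.concatenate([local, global_], axis=1)

adj = np.array([[0, 1, 0], [1, 0, 1], [0, 1, 0]], dtype=float)
feats = np.random.default_rng(0).normal(size=(3, 4))
print(local_global_embeddings(adj, feats).shape)      # (3, 8)
```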
arXiv:2412.08316 [pdf, other] (https://arxiv.org/abs/2412.08316)
Subjects: cs.SI (Social and Information Networks); cs.CL (Computation and Language)
Title: Rumor Detection on Social Media with Temporal Propagation Structure Optimization
Authors: Xingyu Peng, Junran Wu, Ruomei Liu, Ke Xu
Abstract: Traditional methods for detecting rumors on social media primarily focus on analyzing textual content, often struggling to capture the complexity of online interactions. Recent research has shifted towards leveraging graph neural networks to model the hierarchical conversation structure that emerges during rumor propagation. However, these methods tend to overlook the temporal aspect of rumor propagation and may disregard potential noise within the propagation structure. In this paper, we propose a novel approach that incorporates temporal information by constructing a weighted propagation tree, where the weight of each edge represents the time interval between connected posts. Drawing upon the theory of structural entropy, we transform this tree into a coding tree. This transformation aims to preserve the essential structure of rumor propagation while reducing noise. Finally, we introduce a recursive neural network to learn from the coding tree for rumor veracity prediction. Experimental results on two common datasets demonstrate the superiority of our approach.
Submitted 12 December, 2024; v1 submitted 11 December, 2024; originally announced December 2024.
Comments: COLING'25
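The weighted propagation tree is concrete enough for a toy construction. In the sketch below, each reply edge carries the time gap between a post and its parent, which is exactly the edge weight the abstract describes; the post schema itself is hypothetical, not the paper's data format.

```python
# Toy weighted propagation tree: edge weight = time interval (minutes)
# between a post and the post it replies to.
posts = [
    {"id": 0, "parent": None, "t": 0},   # source claim
    {"id": 1, "parent": 0, "t": 12},     # reply 12 minutes later
    {"id": 2, "parent": 0, "t": 45},
    {"id": 3, "parent": 1, "t": 50},
]
edges = {
    (p["parent"], p["id"]): p["t"] - posts[p["parent"]]["t"]
    for p in posts if p["parent"] is not None
}
print(edges)  # {(0, 1): 12, (0, 2): 45, (1, 3): 38}
```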
arXiv:2412.07233 [pdf, other] (https://arxiv.org/abs/2412.07233)
Subjects: cs.CV (Computer Vision and Pattern Recognition)
Title: Repetitive Action Counting with Hybrid Temporal Relation Modeling
Authors: Kun Li, Xinge Peng, Dan Guo, Xun Yang, Meng Wang
Abstract: Repetitive Action Counting (RAC) aims to count the number of repetitive actions occurring in videos. In the real world, repetitive actions show great diversity and bring numerous challenges (e.g., viewpoint changes, non-uniform periods, and action interruptions). Existing methods based on the temporal self-similarity matrix (TSSM) for RAC capture action periods insufficiently when applied to complicated daily videos. To tackle this issue, we propose a novel method named Hybrid Temporal Relation Modeling Network (HTRM-Net) to build diverse TSSMs for RAC. HTRM-Net mainly consists of three key components: bi-modal temporal self-similarity matrix modeling, random matrix dropping, and local temporal context modeling. Specifically, we construct temporal self-similarity matrices by bi-modal (self-attention and dual-softmax) operations, yielding diverse matrix representations from the combination of row-wise and column-wise correlations. To further enhance the matrix representations, we incorporate a random matrix dropping module to explicitly guide channel-wise learning of the matrix. After that, we inject the local temporal context of video frames and the learned matrix into temporal correlation modeling, which makes the model robust to error-prone situations such as action interruption. Finally, a multi-scale matrix fusion module is designed to aggregate temporal correlations adaptively across multi-scale matrices. Extensive experiments in intra- and cross-dataset settings demonstrate that the proposed method not only outperforms current state-of-the-art methods but also accurately counts repetitive actions in unseen action categories. Notably, our method surpasses the classical TransRAC method by 20.04% in MAE and 22.76% in OBO.
Submitted 10 December, 2024; originally announced December 2024.
Comments: To be published in IEEE Transactions on Multimedia
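The bi-modal TSSM construction admits a small numeric sketch. The NumPy toy below is one plausible reading, not the HTRM-Net code: the same frame-to-frame similarity matrix is normalized once as row-wise attention and once as a dual-softmax product of row-wise and column-wise normalizations.

```python
import numpy as np

def tssm_bimodal(x):
    """Toy TSSM sketch: similarity of every frame with every other frame,
    normalized two ways (assumed reading of 'self-attention' and
    'dual-softmax', not the paper's implementation)."""
    sim = x @ x.T / np.sqrt(x.shape[1])        # raw frame-to-frame similarity
    def softmax(m, axis):
        e = np.exp(m - m.max(axis=axis, keepdims=True))
        return e / e.sum(axis=axis, keepdims=True)
    attn = softmax(sim, axis=1)                          # row-wise attention
    dual = softmax(sim, axis=0) * softmax(sim, axis=1)   # dual-softmax: both axes
    return np.stack([attn, dual])                        # two TSSM "modalities"

frames = np.random.default_rng(0).normal(size=(8, 16))   # 8 frames, 16-dim feats
print(tssm_bimodal(frames).shape)                        # (2, 8, 8)
```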
arXiv:2412.07202 [pdf, other] (https://arxiv.org/abs/2412.07202)
Subjects: cs.CR (Cryptography and Security)
Title: BrokerChain: A Blockchain Sharding Protocol by Exploiting Broker Accounts
Authors: Huawei Huang, Zhaokang Yin, Qinde Chen, Guang Ye, Xiaowen Peng, Yue Lin, Zibin Zheng, Song Guo
Abstract: State-of-the-art blockchain sharding solutions, such as Monoxide, can cause a severely imbalanced distribution of transaction (TX) workloads across all blockchain shards due to the deployment policy of their accounts. Imbalanced TX distributions then produce hot shards, in which cross-shard TXs may experience unlimited confirmation latency. Thus, how to address the hot-shard issue and how to reduce cross-shard TXs become significant challenges of blockchain sharding. Through reviewing the related studies, we find that a cross-shard TX protocol that can achieve workload balance among all shards and simultaneously reduce the quantity of cross-shard TXs is still absent from the literature. To this end, we propose BrokerChain, a cross-shard blockchain protocol dedicated to account-based state sharding. Essentially, BrokerChain exploits fine-grained state partitioning and account segmentation. We also elaborate on how BrokerChain handles cross-shard TXs through broker accounts. The security issues and other properties of BrokerChain are analyzed rigorously. Finally, we conduct comprehensive evaluations using an open-source blockchain sharding prototype named BlockEmulator. The evaluation results show that BrokerChain outperforms other baselines in terms of transaction throughput, transaction confirmation latency, the queue size of the transaction pool, and workload balance.
Submitted 10 December, 2024; originally announced December 2024.
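One way to picture the broker-account idea: an account with state in more than one shard can turn a single cross-shard transfer into two intra-shard legs. The Python toy below is only a reading of that idea, not BrokerChain's actual protocol; the data structures and routing rule are hypothetical.

```python
# Illustrative toy only (not the BrokerChain protocol): route a cross-shard
# transfer through a broker account that holds state in both shards.
def route_via_broker(sender, receiver, amount, home_shard, broker_shards):
    s, r = home_shard[sender], home_shard[receiver]
    if s == r:
        return [(sender, receiver, amount, s)]          # already intra-shard
    broker = next(b for b, sh in broker_shards.items() if {s, r} <= sh)
    return [(sender, broker, amount, s),                # leg 1: sender's shard
            (broker, receiver, amount, r)]              # leg 2: receiver's shard

home_shard = {"alice": 0, "bob": 1}
broker_shards = {"brk": {0, 1}}
print(route_via_broker("alice", "bob", 5, home_shard, broker_shards))
```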
arXiv:2411.19577 [pdf, other] (https://arxiv.org/abs/2411.19577)
Subjects: cs.SE (Software Engineering); cs.RO (Robotics)
Title: RoadGen: Generating Road Scenarios for Autonomous Vehicle Testing
Authors: Fan Yang, You Lu, Bihuan Chen, Peng Qin, Xin Peng
Abstract: With the rapid development of autonomous vehicles, there is an increasing demand for scenario-based testing to simulate diverse driving scenarios. However, as the basis of any driving scenario, road scenarios (e.g., road topology and geometry) have received little attention in the literature. Despite several advances, existing approaches either generate basic road components without a complete road network, or generate a complete road network but with simple road components. The resulting road scenarios lack diversity in both topology and geometry. To address this problem, we propose RoadGen to systematically generate diverse road scenarios. The key idea is to connect eight types of parameterized road components to form road scenarios with high diversity in topology and geometry. Our evaluation has demonstrated the effectiveness and usefulness of RoadGen in generating diverse road scenarios for simulation.
Submitted 29 November, 2024; originally announced November 2024.
Comments: 7 pages
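The "connect parameterized components" idea is easy to sketch. The paper names eight component types; the four used below and their parameters are placeholders chosen for illustration, not RoadGen's catalog.

```python
import random

# Hypothetical sketch: chain parameterized road components into a road.
COMPONENTS = ["straight", "curve", "T-junction", "roundabout"]

def random_road(n, seed=0):
    rng = random.Random(seed)
    road = []
    for _ in range(n):
        kind = rng.choice(COMPONENTS)
        params = {"length_m": round(rng.uniform(20, 200), 1)}
        if kind == "curve":
            params["radius_m"] = round(rng.uniform(10, 80), 1)
        road.append((kind, params))
    return road

print(random_road(3))
```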
arXiv:2411.19567 [pdf, other] (https://arxiv.org/abs/2411.19567)
Subjects: cs.SE (Software Engineering); cs.RO (Robotics)
Title: AdvFuzz: Finding More Violations Caused by the EGO Vehicle in Simulation Testing by Adversarial NPC Vehicles
Authors: You Lu, Yifan Tian, Dingji Wang, Bihuan Chen, Xin Peng
Abstract: Recently, there has been a significant escalation in both academic and industrial commitment towards the development of autonomous driving systems (ADSs). A number of simulation testing approaches have been proposed to generate diverse driving scenarios for ADS testing. However, scenarios generated by these previous approaches are static and lack interactions between the EGO vehicle and the NPC vehicles, so finding violation scenarios takes a large amount of time on average. Besides, a large number of the violations they find are caused by aggressive behaviors of NPC vehicles and reveal no bugs in the ADS. In this work, we propose the concept of adversarial NPC vehicles and introduce AdvFuzz, a novel simulation testing approach, to generate adversarial scenarios on main lanes (e.g., urban roads and highways). AdvFuzz allows NPC vehicles to dynamically interact with the EGO vehicle and regulates the behaviors of NPC vehicles, finding more violation scenarios caused by the EGO vehicle more quickly. We compare AdvFuzz with a random approach and three state-of-the-art scenario-based testing approaches. Our experiments demonstrate that AdvFuzz generates 198.34% more violation scenarios than the other four approaches in 12 hours and increases the proportion of violations caused by the EGO vehicle to 87.04%, more than 7 times that of the other approaches. Additionally, AdvFuzz is at least 92.21% faster in finding one violation caused by the EGO vehicle than the other approaches.
Submitted 29 November, 2024; originally announced November 2024.
Comments: 21 pages
arXiv:2411.18499 [pdf, other] (https://arxiv.org/abs/2411.18499)
Subjects: cs.CV (Computer Vision and Pattern Recognition)
Title: GATE OpenING: A Comprehensive Benchmark for Judging Open-ended Interleaved Image-Text Generation
Authors: Pengfei Zhou, Xiaopeng Peng, Jiajun Song, Chuanhao Li, Zhaopan Xu, Yue Yang, Ziyao Guo, Hao Zhang, Yuqi Lin, Yefei He, Lirui Zhao, Shuo Liu, Tianhua Li, Yuxuan Xie, Xiaojun Chang, Yu Qiao, Wenqi Shao, Kaipeng Zhang
Abstract: Multimodal Large Language Models (MLLMs) have made significant strides in visual understanding and generation tasks. However, generating interleaved image-text content remains a challenge, as it requires integrated multimodal understanding and generation abilities. While progress in unified models offers new solutions, existing benchmarks are insufficient for evaluating these methods due to limitations in data size and diversity. To bridge this gap, we introduce GATE OpenING (OpenING), a comprehensive benchmark comprising 5,400 high-quality human-annotated instances across 56 real-world tasks. OpenING covers diverse daily scenarios such as travel guides, design, and brainstorming, offering a robust platform for challenging interleaved generation methods. In addition, we present IntJudge, a judge model for evaluating open-ended multimodal generation methods. Trained with a novel data pipeline, our IntJudge achieves an agreement rate of 82.42% with human judgments, outperforming GPT-based evaluators by 11.34%. Extensive experiments on OpenING reveal that current interleaved generation methods still have substantial room for improvement. Key findings on interleaved image-text generation are further presented to guide the development of next-generation models. OpenING is open-sourced at https://opening-benchmark.github.io.
Submitted 1 December, 2024; v1 submitted 27 November, 2024; originally announced November 2024.
Comments: 53 pages, 19 figures
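The headline IntJudge number is an agreement rate with human judgments. As a minimal illustration, assuming parallel lists of pairwise preference labels (the labels here are made up, not benchmark data):

```python
# Toy agreement rate between a judge model's preferences and human labels.
judge = ["A", "B", "A", "A", "B"]
human = ["A", "B", "B", "A", "B"]
agreement = sum(j == h for j, h in zip(judge, human)) / len(human)
print(f"{agreement:.2%}")  # 80.00%
```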
arXiv:2411.17230 [pdf, other] (https://arxiv.org/abs/2411.17230)
Subjects: cs.SE (Software Engineering)
Title: Fault Localization from the Semantic Code Search Perspective
Authors: Yihao Qin, Shangwen Wang, Yan Lei, Zhuo Zhang, Bo Lin, Xin Peng, Liqian Chen, Xiaoguang Mao
Abstract: The software development process is characterized by an iterative cycle of continuous functionality implementation and debugging, which is essential for enhancing software quality and adapting to changing requirements. This process incorporates two separately studied tasks: Code Search (CS), which retrieves reference code from a code corpus to aid code implementation, and Fault Localization (FL), which identifies the code entities responsible for bugs within a software project to aid debugging. The two tasks are similar in that both address search problems. Notably, CS techniques have demonstrated greater effectiveness than FL ones, possibly because natural language queries offer precise semantic details of the required code that are not readily accessible to FL methods. Drawing inspiration from this, we hypothesize that a fault localizer could achieve greater proficiency if semantic information about the buggy methods were made available. Based on this idea, we propose CosFL, an FL approach that decomposes the FL task into two steps: query generation, which describes the functionality of the problematic code in natural language, and fault retrieval, which uses CS to find program elements semantically related to the query. Specifically, to depict the buggy functionalities and generate high-quality queries, CosFL extensively harnesses the code analysis, semantic comprehension, and decision-making capabilities of LLMs. Moreover, to enhance the accuracy of CS, CosFL captures varying levels of context information and employs a multi-granularity code search strategy, which facilitates more precise identification of buggy methods from a holistic view. The evaluation on 835 real bugs from 23 Java projects shows that CosFL successfully localizes 324 bugs within Top-1, significantly outperforming the state-of-the-art approaches by 26.6%-57.3%.
Submitted 26 November, 2024; originally announced November 2024.
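The fault-retrieval step reduces to ranking candidate methods by semantic similarity to the generated query. The sketch below assumes embeddings already exist for the query and the candidate methods; the embedding model and method list are placeholders, not CosFL's pipeline.

```python
import numpy as np

def rank_suspects(query_vec, method_vecs, method_names):
    """Rank candidate methods by cosine similarity to a query embedding
    (a generic retrieval sketch, not the CosFL implementation)."""
    q = query_vec / np.linalg.norm(query_vec)
    m = method_vecs / np.linalg.norm(method_vecs, axis=1, keepdims=True)
    scores = m @ q                            # cosine similarity per method
    order = np.argsort(-scores)
    return [(method_names[i], float(scores[i])) for i in order]

rng = np.random.default_rng(0)
vecs = rng.normal(size=(3, 8))                # stand-in method embeddings
print(rank_suspects(rng.normal(size=8), vecs, ["parse", "render", "save"])[0])
```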
arXiv:2411.16575 [pdf, other] (https://arxiv.org/abs/2411.16575)
Subjects: cs.CV (Computer Vision and Pattern Recognition)
Title: Rethinking Diffusion for Text-Driven Human Motion Generation
Authors: Zichong Meng, Yiming Xie, Xiaogang Peng, Zeyu Han, Huaizu Jiang
Abstract: Since 2023, Vector Quantization (VQ)-based discrete generation methods have rapidly dominated human motion generation, primarily surpassing diffusion-based continuous generation methods in standard performance metrics. However, VQ-based methods have inherent limitations. Representing continuous motion data as a limited set of discrete tokens leads to inevitable information loss, reduces the diversity of generated motions, and restricts their ability to function effectively as motion priors or generation guidance. In contrast, the continuous-space generation nature of diffusion-based methods makes them well suited to address these limitations, with additional potential for model scalability. In this work, we systematically investigate why current VQ-based methods perform well and explore the limitations of existing diffusion-based methods from the perspective of motion data representation and distribution. Drawing on these insights, we preserve the inherent strengths of a diffusion-based human motion generation model and gradually optimize it with inspiration from VQ-based approaches. Our approach introduces a human motion diffusion model able to perform bidirectional masked autoregression, optimized with a reformed data representation and distribution. Additionally, we propose more robust evaluation methods to fairly assess methods of both kinds. Extensive experiments on benchmark human motion generation datasets demonstrate that our method outperforms previous methods and achieves state-of-the-art performance.
Submitted 25 November, 2024; originally announced November 2024.
Comments: Preprint
arXiv:2411.15206 [pdf, ps, other] (https://arxiv.org/abs/2411.15206)
Subjects: cs.LG (Machine Learning); cs.AI (Artificial Intelligence)
Title: Conditional Distribution Learning on Graphs
Authors: Jie Chen, Hua Mao, Yuanbiao Gou, Zhu Wang, Xi Peng
Abstract: Leveraging the diversity and quantity of data provided by various graph-structured data augmentations while preserving intrinsic semantic information is challenging. Additionally, successive layers in a graph neural network (GNN) tend to produce increasingly similar node embeddings, while graph contrastive learning aims to increase the dissimilarity between negative pairs of node embeddings. This inevitably results in a conflict between the message-passing mechanism (MPM) of GNNs and the contrastive learning (CL) of negative pairs via intra-views. In this paper, we propose a conditional distribution learning (CDL) method that learns graph representations from graph-structured data for semi-supervised graph classification. Specifically, we present an end-to-end graph representation learning model that aligns the conditional distributions of weakly and strongly augmented features over the original features. This alignment enables the CDL model to effectively preserve intrinsic semantic information when both weak and strong augmentations are applied to graph-structured data. To avoid the conflict between the MPM and the CL of negative pairs, positive pairs of node representations are retained for measuring the similarity between the original features and the corresponding weakly augmented features. Extensive experiments with several benchmark graph datasets demonstrate the effectiveness of the proposed CDL method.
Submitted 28 January, 2025; v1 submitted 20 November, 2024; originally announced November 2024.
Comments: 9 pages
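The alignment of conditional distributions over the original features can be sketched with a divergence penalty. The NumPy toy below is one reading of that idea, not the CDL code: each augmented view's similarities to the original features are turned into a distribution, and the two views are penalized for diverging.

```python
import numpy as np

def kl_alignment(orig, weak, strong, tau=0.5):
    """Toy alignment loss (assumed reading, not the paper's objective):
    KL divergence between the weak and strong views' conditional
    distributions over the original features."""
    def cond_dist(view):
        sim = view @ orig.T / tau
        e = np.exp(sim - sim.max(axis=1, keepdims=True))
        return e / e.sum(axis=1, keepdims=True)
    p, q = cond_dist(weak), cond_dist(strong)
    return float(np.mean(np.sum(p * np.log((p + 1e-9) / (q + 1e-9)), axis=1)))

rng = np.random.default_rng(0)
orig = rng.normal(size=(5, 8))
print(kl_alignment(orig, orig + 0.1 * rng.normal(size=(5, 8)),
                   orig + 0.5 * rng.normal(size=(5, 8))))
```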
arXiv:2411.15131 [pdf, other] (https://arxiv.org/abs/2411.15131)
Subjects: cs.RO (Robotics); cs.CV (Computer Vision and Pattern Recognition); cs.LG (Machine Learning)
Title: WildLMa: Long Horizon Loco-Manipulation in the Wild
Authors: Ri-Zhao Qiu, Yuchen Song, Xuanbin Peng, Sai Aneesh Suryadevara, Ge Yang, Minghuan Liu, Mazeyu Ji, Chengzhe Jia, Ruihan Yang, Xueyan Zou, Xiaolong Wang
Abstract: 'In-the-wild' mobile manipulation aims to deploy robots in diverse real-world environments, which requires the robot to (1) have skills that generalize across object configurations; (2) be capable of long-horizon task execution in diverse environments; and (3) perform complex manipulation beyond pick-and-place. Quadruped robots with manipulators hold promise for extending the workspace and enabling robust locomotion, but existing results do not investigate such a capability. This paper proposes WildLMa with three components to address these issues: (1) adaptation of a learned low-level controller for VR-enabled whole-body teleoperation and traversability; (2) WildLMa-Skill, a library of generalizable visuomotor skills acquired via imitation learning or heuristics; and (3) WildLMa-Planner, an interface of learned skills that allows LLM planners to coordinate skills for long-horizon tasks. We demonstrate the importance of high-quality training data by achieving a higher grasping success rate than existing RL baselines using only tens of demonstrations. WildLMa exploits CLIP for language-conditioned imitation learning that empirically generalizes to objects unseen in training demonstrations. Besides extensive quantitative evaluation, we qualitatively demonstrate practical robot applications, such as cleaning up trash in university hallways or outdoor terrains, operating articulated objects, and rearranging items on a bookshelf.
Submitted 22 November, 2024; originally announced November 2024.
Comments: Website: https://wildlma.github.io/
arXiv:2411.14049 [pdf, other] (https://arxiv.org/abs/2411.14049)
Subjects: cs.LG (Machine Learning); cs.CV (Computer Vision and Pattern Recognition)
Title: Out-Of-Distribution Detection with Diversification (Provably)
Authors: Haiyun Yao, Zongbo Han, Huazhu Fu, Xi Peng, Qinghua Hu, Changqing Zhang
Abstract: Out-of-distribution (OOD) detection is crucial for ensuring reliable deployment of machine learning models. Recent advancements focus on utilizing easily accessible auxiliary outliers (e.g., data from the web or other datasets) in training. However, we experimentally reveal that these methods still struggle to generalize their detection capabilities to unknown OOD data, due to the limited diversity of the auxiliary outliers collected. Therefore, we thoroughly examine this problem from the generalization perspective and demonstrate that a more diverse set of auxiliary outliers is essential for enhancing detection capabilities. However, in practice, it is difficult and costly to collect sufficiently diverse auxiliary outlier data. Therefore, we propose a simple yet practical approach with a theoretical guarantee, termed Diversity-induced Mixup for OOD detection (diverseMix), which enhances the diversity of the auxiliary outlier set for training in an efficient way. Extensive experiments show that diverseMix achieves superior performance on commonly used and recent challenging large-scale benchmarks, which further confirms the importance of the diversity of auxiliary outliers.
Submitted 21 November, 2024; originally announced November 2024.
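Mixup itself is standard and easy to sketch: convex combinations of outlier pairs enlarge the auxiliary outlier set. The hyperparameters and pair-sampling details below are illustrative, not diverseMix's exact procedure.

```python
import numpy as np

def diversity_mixup(outliers, n_new, alpha=1.0, seed=0):
    """Generic mixup over auxiliary outliers (sketch of the underlying idea,
    not the diverseMix algorithm): sample pairs and mix with Beta weights."""
    rng = np.random.default_rng(seed)
    i = rng.integers(0, len(outliers), size=n_new)
    j = rng.integers(0, len(outliers), size=n_new)
    lam = rng.beta(alpha, alpha, size=(n_new, 1))
    return lam * outliers[i] + (1 - lam) * outliers[j]

outliers = np.random.default_rng(1).normal(size=(10, 4))
print(diversity_mixup(outliers, 3).shape)  # (3, 4)
```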
arXiv:2411.13990 [pdf, other] (https://arxiv.org/abs/2411.13990)
Subjects: cs.SE (Software Engineering)
Title: Repository-level Code Translation Benchmark Targeting Rust
Authors: Guangsheng Ou, Mingwei Liu, Yuxuan Chen, Xin Peng, Zibin Zheng
Abstract: Recent advances in large language models (LLMs) have shown significant capabilities in code translation, often evaluated using benchmarks like CodeTransOcean. However, these evaluations typically focus on simple, function-level translations without considering dependencies, which does not reflect the complexities of real-world software development. Further, LLMs' effectiveness in translating to newer, lower-resource languages like Rust in realistic scenarios is still under-explored. To address this gap, we introduce the first repository-level code translation benchmark, comprising 375 tasks targeting Rust, complete with relevant dependencies. Using this benchmark, we study four state-of-the-art LLMs, analyzing their erroneous outputs to understand their performance in more complex translation scenarios. Our findings reveal that LLMs exhibit substantially worse performance on repository-level translations than on simpler tasks (a 41.5%-56.2% Pass@1 drop for GPT-4), highlighting limitations in existing evaluation methods. The best-performing model is Claude-3.5, which demonstrates the strongest translation capabilities in both basic functionality accuracy and several relevant additional abilities. Additionally, we discover that LLMs struggle with identifying language differences in complex tasks, and that increased dependencies correlate with greater translation difficulty.
Submitted 26 November, 2024; v1 submitted 21 November, 2024; originally announced November 2024.
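For reference, Pass@1 is the k=1 case of the standard unbiased pass@k estimator (Chen et al., 2021): with n samples per task of which c pass, pass@k = 1 - C(n-c, k)/C(n, k). A small sketch, included only to illustrate the metric the benchmark reports:

```python
import numpy as np

def pass_at_k(n, c, k):
    """Standard unbiased pass@k estimator: n samples per task, c passing."""
    if n - c < k:
        return 1.0
    return 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1))

print(pass_at_k(n=10, c=3, k=1))  # 0.3, i.e. Pass@1 for 3/10 passing samples
```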
arXiv:2411.13050 [pdf, other] (https://arxiv.org/abs/2411.13050)
Subjects: cs.AR (Hardware Architecture)
Title: Topkima-Former: Low-energy, Low-Latency Inference for Transformers using top-k In-memory ADC
Authors: Shuai Dong, Junyi Yang, Xiaoqi Peng, Hongyang Shang, Ye Ke, Xiaofeng Yang, Hongjie Liu, Arindam Basu
Abstract: The Transformer model has gained prominence as a popular deep neural network architecture for natural language processing (NLP) and computer vision (CV) applications. However, the extensive use of nonlinear operations, like softmax, poses a performance bottleneck during transformer inference, comprising up to 40% of the total latency. Hence, we propose innovations at the circuit, architecture, and algorithm levels to accelerate the transformer. At the circuit level, we propose topkima, combining top-k activation selection with an in-memory ADC (IMA) to implement a low-energy and low-latency softmax without any sorting latency. Only the k largest activations are sent to the softmax calculation block, reducing the huge computational cost of softmax. Using a modified training scheme with top-k only in the forward pass, experimental results demonstrate only a 0.4% to 1.2% reduction in accuracy across ViT, distilBERT, and BERT-base models when evaluated on the CIFAR-10, CIFAR-100, and SQuAD datasets with k=5. At the architecture level, an improved scale-free technique is introduced to reduce the computational cost of attention. The combined system, dubbed Topkima-Former, achieves a 1.8x-84x speedup and 1.3x-35x better energy efficiency (EE) than prior in-memory computing (IMC) accelerators. Compared to a conventional softmax macro and a digital top-k (Dtopk) softmax macro, our proposed topkima softmax macro is about 15x and 8x faster, respectively.
Submitted 20 November, 2024; originally announced November 2024.
Comments: 7 pages
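The functional effect of sending only the k largest activations to the softmax block can be modeled in a few lines. This is a purely numerical toy of top-k softmax, not the in-memory ADC circuit:

```python
import numpy as np

def topk_softmax(x, k=5):
    """Softmax computed over the k largest activations only; all other
    positions get probability zero (functional model of the top-k idea)."""
    out = np.zeros_like(x, dtype=float)
    idx = np.argpartition(x, -k)[-k:]     # indices of the k largest values
    e = np.exp(x[idx] - x[idx].max())
    out[idx] = e / e.sum()
    return out

x = np.array([0.1, 2.0, -1.0, 3.0, 0.5, 1.5, -0.2, 2.5])
print(topk_softmax(x, k=5).round(3))
```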
arXiv:2411.11027 (https://arxiv.org/abs/2411.11027) [pdf, other]  cs.CL cs.AI
BianCang: A Traditional Chinese Medicine Large Language Model
Authors: Sibo Wei, Xueping Peng, Yi-fei Wang, Jiasheng Si, Weiyu Zhang, Wenpeng Lu, Xiaoming Wu, Yinglong Wang
Abstract: The rise of large language models (LLMs) has driven significant progress in medical applications, including traditional Chinese medicine (TCM). However, current medical LLMs struggle with TCM diagnosis and syndrome differentiation because of the substantial differences between TCM and modern medical theory and the scarcity of specialized, high-quality corpora. This paper addresses these challenges by proposing BianCang, a TCM-specific LLM built with a two-stage training process that first injects domain-specific knowledge and then aligns it through targeted stimulation. To enhance diagnostic and differentiation capabilities, we constructed pre-training corpora, instruction-aligned datasets based on real hospital records, and the ChP-TCM dataset derived from the Pharmacopoeia of the People's Republic of China. We compiled extensive TCM and medical corpora for continuous pre-training and supervised fine-tuning, building a comprehensive dataset to refine the model's understanding of TCM. Evaluations across 11 test sets involving 29 models and 4 tasks demonstrate the effectiveness of BianCang, offering valuable insights for future research. Code, datasets, and models are available at https://github.com/QLU-NLP/BianCang.
Submitted 17 November, 2024; originally announced November 2024.
arXiv:2411.10023 (https://arxiv.org/abs/2411.10023) [pdf, other]  cs.LG
Model Inversion Attacks: A Survey of Approaches and Countermeasures
Authors: Zhanke Zhou, Jianing Zhu, Fengfei Yu, Xuan Li, Xiong Peng, Tongliang Liu, Bo Han
Abstract: The success of deep neural networks has driven numerous research studies and applications from Euclidean to non-Euclidean data. However, there are increasing concerns about privacy leakage, as these networks rely on processing private data. Recently, a new class of privacy attacks, model inversion attacks (MIAs), has aimed to extract sensitive features of the private training data by abusing access to a well-trained model. The effectiveness of MIAs has been demonstrated in various domains, including images, texts, and graphs. These attacks highlight the vulnerability of neural networks and raise awareness within the research community of the risk of privacy leakage. Despite their significance, there is a lack of systematic studies that provide a comprehensive overview of and deeper insight into MIAs across different domains. This survey summarizes up-to-date MIA methods for both attack and defense, highlighting their contributions and limitations, underlying modeling principles, optimization challenges, and future directions. We hope this survey bridges the gap in the literature and facilitates future research in this critical area. We also maintain a repository of relevant research at https://github.com/AndrewZhou924/Awesome-model-inversion-attack.
Submitted 15 November, 2024; originally announced November 2024.
Comments: 40 pages, 17 figures
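For intuition, the oldest family of MIAs simply runs gradient ascent on the input to maximize the victim model's confidence in a chosen class, recovering a class-representative input. The PyTorch sketch below is a minimal caricature of that family against a toy stand-in model, not any specific method from the survey.

    import torch
    import torch.nn as nn

    # Toy "well-trained" classifier standing in for the victim model.
    victim = nn.Sequential(nn.Linear(64, 32), nn.ReLU(), nn.Linear(32, 10))
    victim.eval()

    def invert(model: nn.Module, target_class: int, steps: int = 200):
        """Gradient-ascent inversion: synthesize an input that the model
        classifies as target_class with high confidence."""
        x = torch.zeros(1, 64, requires_grad=True)
        opt = torch.optim.Adam([x], lr=0.1)
        for _ in range(steps):
            opt.zero_grad()
            loss = -model(x)[0, target_class]   # maximize the target logit
            loss.backward()
            opt.step()
        return x.detach()

    recon = invert(victim, target_class=3)      # class-representative input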
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10023v1-abstract-full').style.display = 'none'; document.getElementById('2411.10023v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">40 pages, 17 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.10020">arXiv:2411.10020</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.10020">pdf</a>, <a href="https://arxiv.org/format/2411.10020">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Information Extraction from Clinical Notes: Are We Ready to Switch to Large Language Models? </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Hu%2C+Y">Yan Hu</a>, <a href="/search/cs?searchtype=author&amp;query=Zuo%2C+X">Xu Zuo</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+Y">Yujia Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Peng%2C+X">Xueqing Peng</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+J">Jimin Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Keloth%2C+V+K">Vipina K. Keloth</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+V+J">Vincent J. Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Weng%2C+R">Ruey-Ling Weng</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+Q">Qingyu Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Jiang%2C+X">Xiaoqian Jiang</a>, <a href="/search/cs?searchtype=author&amp;query=Roberts%2C+K+E">Kirk E. Roberts</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+H">Hua Xu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.10020v4-abstract-short" style="display: inline;"> Backgrounds: Information extraction (IE) is critical in clinical natural language processing (NLP). While large language models (LLMs) excel on generative tasks, their performance on extractive tasks remains debated. Methods: We investigated Named Entity Recognition (NER) and Relation Extraction (RE) using 1,588 clinical notes from four sources (UT Physicians, MTSamples, MIMIC-III, and i2b2). We d&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10020v4-abstract-full').style.display = 'inline'; document.getElementById('2411.10020v4-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.10020v4-abstract-full" style="display: none;"> Backgrounds: Information extraction (IE) is critical in clinical natural language processing (NLP). While large language models (LLMs) excel on generative tasks, their performance on extractive tasks remains debated. 
Methods: We investigated Named Entity Recognition (NER) and Relation Extraction (RE) using 1,588 clinical notes from four sources (UT Physicians, MTSamples, MIMIC-III, and i2b2). We developed an annotated corpus covering 4 clinical entities and 16 modifiers, and compared instruction-tuned LLaMA-2 and LLaMA-3 against BERT in terms of performance, generalizability, computational resources, and throughput to BERT. Results: LLaMA models outperformed BERT across datasets. With sufficient training data, LLaMA showed modest improvements (1% on NER, 1.5-3.7% on RE); improvements were larger with limited training data. On unseen i2b2 data, LLaMA-3-70B outperformed BERT by 7% (F1) on NER and 4% on RE. However, LLaMA models required more computing resources and ran up to 28 times slower. We implemented &#34;Kiwi,&#34; a clinical IE package featuring both models, available at https://kiwi.clinicalnlp.org/. Conclusion: This study is among the first to develop and evaluate a comprehensive clinical IE system using open-source LLMs. Results indicate that LLaMA models outperform BERT for clinical NER and RE but with higher computational costs and lower throughputs. These findings highlight that choosing between LLMs and traditional deep learning methods for clinical IE applications should remain task-specific, taking into account both performance metrics and practical considerations such as available computing resources and the intended use case scenarios. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10020v4-abstract-full').style.display = 'none'; document.getElementById('2411.10020v4-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 15 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
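The F1 comparisons above are entity-level scores of the kind typically computed with a BIO-tag scorer. As a concrete but hypothetical example of how such a score is obtained, the snippet below uses the common seqeval package with made-up PROBLEM/DRUG tags; the paper's actual entity types and scoring pipeline may differ.

    from seqeval.metrics import f1_score

    # Gold and predicted BIO tags for two invented snippets.
    y_true = [["B-PROBLEM", "I-PROBLEM", "O", "B-DRUG"],
              ["O", "B-DRUG", "O"]]
    y_pred = [["B-PROBLEM", "I-PROBLEM", "O", "O"],
              ["O", "B-DRUG", "O"]]

    # Entity-level micro F1: 2 of 3 gold entities recovered exactly -> 0.8
    print(f1_score(y_true, y_pred))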
arXiv:2411.07672 (https://arxiv.org/abs/2411.07672) [pdf, other]  cs.LG
Rethinking Structure Learning For Graph Neural Networks
Authors: Yilun Zheng, Zhuofan Zhang, Ziming Wang, Xiang Li, Sitao Luan, Xiaojiang Peng, Lihui Chen
Abstract: To improve the performance of Graph Neural Networks (GNNs), Graph Structure Learning (GSL) has been extensively applied to reconstruct or refine original graph structures, effectively addressing issues such as heterophily, over-squashing, and noisy structures. While GSL is generally thought to improve GNN performance, it often leads to longer training times and more hyperparameter tuning. Moreover, the distinctions among current GSL methods remain ambiguous from the perspective of GNN training, and there is a lack of theoretical analysis quantifying their effectiveness. Recent studies further suggest that, under fair comparisons with the same hyperparameter tuning, GSL does not consistently outperform baseline GNNs. This motivates a critical question: is GSL really useful for GNNs? To address this question, this paper makes two key contributions. First, we propose a new GSL framework comprising three steps: GSL base construction (the representation used for GSL), new structure construction, and view fusion, to better understand the effectiveness of GSL in GNNs. Second, after graph convolution, we analyze the differences in mutual information (MI) between node representations derived from the original topology and those derived from the newly constructed topology. Surprisingly, our empirical observations and theoretical analysis show that no matter which structure construction method is used, feeding the same GSL bases into the newly constructed graph yields no MI gain over the original GSL bases. To fairly reassess the effectiveness of GSL, we conduct ablation experiments and find that it is the pretrained GSL bases that enhance GNN performance; in most cases, GSL itself cannot improve GNN performance. This finding encourages us to rethink essential components of GNN design, such as self-training and structural encoding, rather than GSL.
Submitted 12 November, 2024; originally announced November 2024.
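The three-step framework (base construction, structure construction, view fusion) admits a compact illustration. The sketch below instantiates it with assumed choices (a pretrained-embedding base, cosine-kNN structure construction, convex-combination fusion); the paper surveys many concrete variants and does not prescribe these.

    import numpy as np

    def knn_adjacency(base: np.ndarray, k: int = 5) -> np.ndarray:
        """Structure construction: connect each node to its k nearest
        neighbors under cosine similarity of the GSL base vectors."""
        z = base / np.linalg.norm(base, axis=1, keepdims=True)
        sim = z @ z.T
        np.fill_diagonal(sim, -np.inf)
        a_new = np.zeros_like(sim)
        rows = np.arange(sim.shape[0])[:, None]
        a_new[rows, np.argsort(sim, axis=1)[:, -k:]] = 1.0
        return np.maximum(a_new, a_new.T)        # symmetrize

    def fuse_views(a_orig, a_new, lam=0.5):
        """View fusion: convex combination of original and learned graphs."""
        return lam * a_orig + (1 - lam) * a_new

    n = 20
    base = np.random.randn(n, 16)                # GSL base, e.g. pretrained embeddings
    a_orig = (np.random.rand(n, n) < 0.1).astype(float)
    a = fuse_views(np.maximum(a_orig, a_orig.T), knn_adjacency(base))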
arXiv:2411.07663 (https://arxiv.org/abs/2411.07663) [pdf, other]  cs.LG cs.SI
Is Graph Convolution Always Beneficial For Every Feature?
Authors: Yilun Zheng, Xiang Li, Sitao Luan, Xiaojiang Peng, Lihui Chen
Abstract: Graph Neural Networks (GNNs) have demonstrated strong capabilities in processing structured data. While traditional GNNs typically treat each feature dimension equally during graph convolution, we raise an important question: is the graph convolution operation equally beneficial for each feature? If not, convolution on certain feature dimensions can be harmful, performing even worse than convolution-free models. In prior studies, metrics based on feature homophily were proposed to assess the impact of graph convolution by measuring a feature's consistency with the graph topology. However, these metrics align poorly with GNN performance and have not been effectively employed to guide feature selection in GNNs. To address these limitations, we introduce a novel metric, Topological Feature Informativeness (TFI), to distinguish GNN-favored from GNN-disfavored features; its effectiveness is validated through both theoretical analysis and empirical observation. Based on TFI, we propose a simple yet effective Graph Feature Selection (GFS) method, which processes GNN-favored and GNN-disfavored features separately, using GNNs and non-GNN models respectively. Compared to original GNNs, GFS significantly improves the extraction of useful topological information from each feature, at comparable computational cost. Extensive experiments show that after applying GFS to 8 baseline and state-of-the-art (SOTA) GNN architectures across 10 datasets, 83.75% of the GFS-augmented cases show significant performance boosts. Furthermore, the proposed TFI metric outperforms other feature selection methods. These results validate the effectiveness of both GFS and TFI. Additionally, we demonstrate that GFS's improvements are robust to hyperparameter tuning, highlighting its potential as a universal method for enhancing various GNN architectures.
Submitted 12 November, 2024; originally announced November 2024.
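GFS's routing idea (score each feature column, send favored columns through a GNN and disfavored ones through a non-GNN model) is easy to sketch. Since the abstract does not give TFI's formula, the scores below are random placeholders, and the two downstream branches are only indicated in comments.

    import numpy as np

    def graph_feature_selection(x, scores, threshold=0.5):
        """Split feature columns into GNN-favored and GNN-disfavored sets
        by a per-feature informativeness score (placeholder for TFI)."""
        favored = scores >= threshold
        return x[:, favored], x[:, ~favored]

    n, d = 100, 12
    x = np.random.randn(n, d)                  # node feature matrix
    tfi = np.random.rand(d)                    # stand-in for real TFI values
    x_gnn, x_rest = graph_feature_selection(x, tfi)
    # x_gnn  -> GNN branch (uses the graph);
    # x_rest -> non-GNN branch (e.g., an MLP);
    # the branches' outputs are combined for the final prediction.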
arXiv:2411.03522 (https://arxiv.org/abs/2411.03522) [pdf, other]  q-bio.GN cs.AI cs.LG
Exploring the Potentials and Challenges of Using Large Language Models for the Analysis of Transcriptional Regulation of Long Non-coding RNAs
Authors: Wei Wang, Zhichao Hou, Xiaorui Liu, Xinxia Peng
Abstract: Research on long non-coding RNAs (lncRNAs) has garnered significant attention due to their critical roles in gene regulation and disease mechanisms. However, the complexity and diversity of lncRNA sequences, along with limited knowledge of their functional mechanisms and of the regulation of their expression, pose significant challenges to lncRNA studies. Given the tremendous success of large language models (LLMs) in capturing complex dependencies in sequential data, this study systematically explores the potential and limitations of LLMs in sequence analysis related to the transcriptional regulation of lncRNA genes. Our extensive experiments demonstrate promising performance of fine-tuned genome foundation models on progressively complex tasks, and we analyze the critical impact of task complexity, model selection, data quality, and biological interpretability on studies of the regulation of lncRNA gene expression.
Submitted 5 November, 2024; originally announced November 2024.
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.03522v1-abstract-full').style.display = 'none'; document.getElementById('2411.03522v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.00172">arXiv:2411.00172</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.00172">pdf</a>, <a href="https://arxiv.org/format/2411.00172">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> SeafloorAI: A Large-scale Vision-Language Dataset for Seafloor Geological Survey </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Nguyen%2C+K+X">Kien X. Nguyen</a>, <a href="/search/cs?searchtype=author&amp;query=Qiao%2C+F">Fengchun Qiao</a>, <a href="/search/cs?searchtype=author&amp;query=Trembanis%2C+A">Arthur Trembanis</a>, <a href="/search/cs?searchtype=author&amp;query=Peng%2C+X">Xi Peng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.00172v2-abstract-short" style="display: inline;"> A major obstacle to the advancements of machine learning models in marine science, particularly in sonar imagery analysis, is the scarcity of AI-ready datasets. While there have been efforts to make AI-ready sonar image dataset publicly available, they suffer from limitations in terms of environment setting and scale. To bridge this gap, we introduce SeafloorAI, the first extensive AI-ready datase&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00172v2-abstract-full').style.display = 'inline'; document.getElementById('2411.00172v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.00172v2-abstract-full" style="display: none;"> A major obstacle to the advancements of machine learning models in marine science, particularly in sonar imagery analysis, is the scarcity of AI-ready datasets. While there have been efforts to make AI-ready sonar image dataset publicly available, they suffer from limitations in terms of environment setting and scale. To bridge this gap, we introduce SeafloorAI, the first extensive AI-ready datasets for seafloor mapping across 5 geological layers that is curated in collaboration with marine scientists. We further extend the dataset to SeafloorGenAI by incorporating the language component in order to facilitate the development of both vision- and language-capable machine learning models for sonar imagery. The dataset consists of 62 geo-distributed data surveys spanning 17,300 square kilometers, with 696K sonar images, 827K annotated segmentation masks, 696K detailed language descriptions and approximately 7M question-answer pairs. 
By making our data processing source code publicly available, we aim to engage the marine science community to enrich the data pool and inspire the machine learning community to develop more robust models. This collaborative approach will enhance the capabilities and applications of our datasets within both fields. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00172v2-abstract-full').style.display = 'none'; document.getElementById('2411.00172v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 31 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.00132">arXiv:2411.00132</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.00132">pdf</a>, <a href="https://arxiv.org/format/2411.00132">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Beyond Accuracy: Ensuring Correct Predictions With Correct Rationales </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Li%2C+T">Tang Li</a>, <a href="/search/cs?searchtype=author&amp;query=Ma%2C+M">Mengmeng Ma</a>, <a href="/search/cs?searchtype=author&amp;query=Peng%2C+X">Xi Peng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.00132v2-abstract-short" style="display: inline;"> Large pretrained foundation models demonstrate exceptional performance and, in some high-stakes applications, even surpass human experts. However, most of these models are currently evaluated primarily on prediction accuracy, overlooking the validity of the rationales behind their accurate predictions. For the safe deployment of foundation models, there is a pressing need to ensure double-correct&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00132v2-abstract-full').style.display = 'inline'; document.getElementById('2411.00132v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.00132v2-abstract-full" style="display: none;"> Large pretrained foundation models demonstrate exceptional performance and, in some high-stakes applications, even surpass human experts. However, most of these models are currently evaluated primarily on prediction accuracy, overlooking the validity of the rationales behind their accurate predictions. For the safe deployment of foundation models, there is a pressing need to ensure double-correct predictions, i.e., correct prediction backed by correct rationales. To achieve this, we propose a two-phase scheme: First, we curate a new dataset that offers structured rationales for visual recognition tasks. 
arXiv:2411.00132 (https://arxiv.org/abs/2411.00132) [pdf, other]  cs.LG cs.AI cs.CV
Beyond Accuracy: Ensuring Correct Predictions With Correct Rationales
Authors: Tang Li, Mengmeng Ma, Xi Peng
Abstract: Large pretrained foundation models demonstrate exceptional performance and, in some high-stakes applications, even surpass human experts. However, most of these models are currently evaluated primarily on prediction accuracy, overlooking the validity of the rationales behind their accurate predictions. For the safe deployment of foundation models, there is a pressing need to ensure double-correct predictions, i.e., correct predictions backed by correct rationales. To achieve this, we propose a two-phase scheme: first, we curate a new dataset that offers structured rationales for visual recognition tasks; second, we propose a rationale-informed optimization method that guides the model in disentangling and localizing visual evidence for each rationale, without requiring manual annotations. Extensive experiments and ablation studies demonstrate that our model outperforms state-of-the-art models by up to 10.1% in prediction accuracy across a wide range of tasks. Furthermore, our method significantly improves the model's rationale correctness, improving localization by 7.5% and disentanglement by 36.5%. Dataset, source code, and pretrained weights: https://github.com/deep-real/DCP
Submitted 6 November, 2024; v1 submitted 31 October, 2024; originally announced November 2024.
Comments: In Proceedings of the 38th Conference on Neural Information Processing Systems (NeurIPS 2024)
arXiv:2410.21218 (https://arxiv.org/abs/2410.21218) [pdf, other]  cs.SE
Lifting the Veil on the Large Language Model Supply Chain: Composition, Risks, and Mitigations
Authors: Kaifeng Huang, Bihuan Chen, You Lu, Susheng Wu, Dingji Wang, Yiheng Huang, Haowen Jiang, Zhuotong Zhou, Junming Cao, Xin Peng
Abstract: Large language models (LLMs) have had a significant impact on both intelligence and productivity. Recent years have witnessed a great surge in the introduction of both commercial and open-source LLMs, and many businesses have adopted LLMs in their applications to solve domain-specific tasks. However, integrating LLMs into specific business scenarios requires more than just utilizing the models themselves; it is a systematic process involving substantial components, collectively referred to as the LLM supply chain. The LLM supply chain inherently carries risks, so it is essential to understand which components may enter the supply chain and what risks they carry, enabling different stakeholders to implement effective mitigation measures. While some literature discusses risks associated with LLMs, no prior work clearly outlines the LLM supply chain from the perspective of both providing and consuming its components. As LLMs have become essential infrastructure in the new era, we believe that a thorough review of the LLM supply chain, along with its inherent risks and mitigation strategies, would be valuable for industry practitioners seeking to avoid potential damage and loss, and enlightening for academic researchers rethinking existing approaches and exploring new avenues of research. Our paper provides a comprehensive overview of the LLM supply chain, detailing the stakeholders, composing artifacts, and supplying types. We develop taxonomies of risk types, risky actions, and mitigations related to the various supply chain stakeholders and components. In summary, our work explores the technical and operational aspects of the LLM supply chain, offering valuable insights for researchers and engineers in the evolving LLM landscape.
Submitted 30 October, 2024; v1 submitted 28 October, 2024; originally announced October 2024.
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">17 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.20688">arXiv:2410.20688</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.20688">pdf</a>, <a href="https://arxiv.org/format/2410.20688">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Biomolecules">q-bio.BM</span> </div> </div> <p class="title is-5 mathjax"> Reprogramming Pretrained Target-Specific Diffusion Models for Dual-Target Drug Design </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+X">Xiangxin Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Guan%2C+J">Jiaqi Guan</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Y">Yijia Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Peng%2C+X">Xingang Peng</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+L">Liang Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Ma%2C+J">Jianzhu Ma</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.20688v2-abstract-short" style="display: inline;"> Dual-target therapeutic strategies have become a compelling approach and attracted significant attention due to various benefits, such as their potential in overcoming drug resistance in cancer therapy. Considering the tremendous success that deep generative models have achieved in structure-based drug design in recent years, we formulate dual-target drug design as a generative task and curate a n&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.20688v2-abstract-full').style.display = 'inline'; document.getElementById('2410.20688v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.20688v2-abstract-full" style="display: none;"> Dual-target therapeutic strategies have become a compelling approach and attracted significant attention due to various benefits, such as their potential in overcoming drug resistance in cancer therapy. Considering the tremendous success that deep generative models have achieved in structure-based drug design in recent years, we formulate dual-target drug design as a generative task and curate a novel dataset of potential target pairs based on synergistic drug combinations. We propose to design dual-target drugs with diffusion models that are trained on single-target protein-ligand complex pairs. Specifically, we align two pockets in 3D space with protein-ligand binding priors and build two complex graphs with shared ligand nodes for SE(3)-equivariant composed message passing, based on which we derive a composed drift in both 3D and categorical probability space in the generative process. Our algorithm can well transfer the knowledge gained in single-target pretraining to dual-target scenarios in a zero-shot manner. We also repurpose linker design methods as strong baselines for this task. 
Extensive experiments demonstrate the effectiveness of our method compared with various baselines.
Submitted 26 November, 2024; v1 submitted 27 October, 2024; originally announced October 2024.
Comments: Accepted to NeurIPS 2024
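One plausible reading of "composed drift" is that, at each reverse-diffusion step, the drifts predicted by the two single-target models on the shared ligand are combined; the equal-weight average in the toy sketch below is purely an assumption for illustration, as the abstract does not state the composition rule.

    import numpy as np

    def reverse_step(x, drift_a, drift_b, dt=0.01, sigma=0.1):
        """One toy reverse-diffusion update on shared ligand coordinates,
        composing the drifts from two single-target models (assumed
        equal-weight average)."""
        drift = 0.5 * (drift_a(x) + drift_b(x))
        noise = sigma * np.sqrt(dt) * np.random.randn(*x.shape)
        return x + drift * dt + noise

    # Stand-ins for the two pretrained single-target drift networks.
    drift_a = lambda x: -x               # pulls the ligand toward pocket A's mode
    drift_b = lambda x: -(x - 1.0)       # pulls the ligand toward pocket B's mode

    x = np.random.randn(8, 3)            # 8 ligand atoms in 3D
    for _ in range(500):
        x = reverse_step(x, drift_a, drift_b)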
arXiv:2410.19933 (https://arxiv.org/abs/2410.19933) [pdf, other]  cs.LG cs.AI cs.CY
Enhancing Safety in Reinforcement Learning with Human Feedback via Rectified Policy Optimization
Authors: Xiyue Peng, Hengquan Guo, Jiawei Zhang, Dongqing Zou, Ziyu Shao, Honghao Wei, Xin Liu
Abstract: Balancing helpfulness and safety (harmlessness) is a critical challenge in aligning large language models (LLMs). Current approaches often decouple these two objectives, training separate preference models for helpfulness and safety while framing safety as a constraint within a constrained Markov Decision Process (CMDP) framework. However, these methods can lead to "safety interference", where average-based safety constraints compromise the safety of some prompts in favor of others. To address this issue, we propose Rectified Policy Optimization (RePO), which replaces the average safety constraint with stricter, per-prompt safety constraints. At the core of RePO is a policy-update mechanism driven by rectified policy gradients, which penalizes the strict safety violation of every prompt, thereby enhancing safety across nearly all prompts. Our experiments on Alpaca-7B demonstrate that RePO improves safety alignment and reduces safety interference compared to baseline methods. Code is available at https://github.com/pxyWaterMoon/RePO.
Submitted 25 October, 2024; originally announced October 2024.
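The difference between an average safety constraint and RePO's per-prompt rectified penalty shows up already in a scalar toy example: averaging lets one unsafe prompt hide behind safe ones, while a rectified (hinge-style) penalty charges every violating prompt individually. This is a simplified surrogate of the idea, not the paper's policy-gradient implementation.

    import numpy as np

    def rectified_objective(rewards, costs, budgets, lam=1.0):
        """Penalize each prompt's own safety violation: max(0, cost - budget)
        is the 'rectified' term; no violation can be offset by another
        prompt's slack, unlike an average-based constraint."""
        violation = np.maximum(0.0, costs - budgets)
        return rewards.mean() - lam * violation.mean()

    rewards = np.array([0.9, 0.7, 0.8])
    costs   = np.array([0.2, 0.9, 0.1])    # prompt 2 exceeds its budget
    budgets = np.array([0.5, 0.5, 0.5])

    print(costs.mean() <= budgets.mean())  # True: the average constraint is met
    print(rectified_objective(rewards, costs, budgets))  # yet prompt 2 is penalized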
