Search | arXiv e-print repository
<a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1–50 of 2,472 results for author: <span class="mathjax">Zhang, M</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a> </span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&query=Zhang%2C+M">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." 
type="text" value="Zhang, M"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Zhang%2C+M&terms-0-field=author&size=50&order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Zhang, M"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. 
</div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Zhang%2C+M&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Zhang%2C+M&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Zhang%2C+M&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Zhang%2C+M&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Zhang%2C+M&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Zhang%2C+M&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">…</span></li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.24026">arXiv:2503.24026</a> <span> [<a href="https://arxiv.org/pdf/2503.24026">pdf</a>, <a href="https://arxiv.org/format/2503.24026">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> HumanDreamer: Generating Controllable Human-Motion Videos via Decoupled Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+B">Boyuan Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xiaofeng Wang</a>, <a href="/search/cs?searchtype=author&query=Ni%2C+C">Chaojun Ni</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+G">Guosheng Zhao</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Z">Zhiqin Yang</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+Z">Zheng Zhu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+M">Muyang Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+Y">Yukun Zhou</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+X">Xinze Chen</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+G">Guan Huang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+L">Lihong Liu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xingang Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.24026v1-abstract-short" style="display: inline;"> Human-motion video generation has been a challenging task, primarily due to the 
difficulty inherent in learning human body movements. While some approaches have attempted to drive human-centric video generation explicitly through pose control, these methods typically rely on poses derived from existing videos, thereby lacking flexibility. To address this, we propose HumanDreamer, a decoupled human video generation framework that first generates diverse poses from text prompts and then leverages these poses to generate human-motion videos. Specifically, we propose MotionVid, the largest dataset for human-motion pose generation. Based on this dataset, we present MotionDiT, which is trained to generate structured human-motion poses from text prompts, and we introduce a novel LAMA loss; together these yield a 62.4% improvement in FID, along with gains in top-1, top-2, and top-3 R-precision of 41.8%, 26.3%, and 18.3% respectively, advancing both Text-to-Pose control accuracy and FID metrics. Our experiments across various Pose-to-Video baselines demonstrate that the poses generated by our method can produce diverse and high-quality human-motion videos. Furthermore, our model can facilitate other downstream tasks, such as pose sequence prediction and 2D-to-3D motion lifting.
Submitted 31 March, 2025; originally announced March 2025.
Comments: Project Page: https://humandreamer.github.io
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project Page: https://humandreamer.github.io</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.23990">arXiv:2503.23990</a> <span> [<a href="https://arxiv.org/pdf/2503.23990">pdf</a>, <a href="https://arxiv.org/format/2503.23990">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> BeMERC: Behavior-Aware MLLM-based Framework for Multimodal Emotion Recognition in Conversation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Fu%2C+Y">Yumeng Fu</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+J">Junjie Wu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zhongjie Wang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+M">Meishan Zhang</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+Y">Yulin Wu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+B">Bingquan Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.23990v1-abstract-short" style="display: inline;"> Multimodal emotion recognition in conversation (MERC), the task of identifying the emotion label for each utterance in a conversation, is vital for developing empathetic machines. Current MLLM-based MERC studies focus mainly on capturing the speaker's textual or vocal characteristics, but ignore the significance of video-derived behavior information. Different from text and audio inputs, learning… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.23990v1-abstract-full').style.display = 'inline'; document.getElementById('2503.23990v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.23990v1-abstract-full" style="display: none;"> Multimodal emotion recognition in conversation (MERC), the task of identifying the emotion label for each utterance in a conversation, is vital for developing empathetic machines. Current MLLM-based MERC studies focus mainly on capturing the speaker's textual or vocal characteristics, but ignore the significance of video-derived behavior information. Different from text and audio inputs, learning videos with rich facial expression, body language and posture, provides emotion trigger signals to the models for more accurate emotion predictions. In this paper, we propose a novel behavior-aware MLLM-based framework (BeMERC) to incorporate speaker's behaviors, including subtle facial micro-expression, body language and posture, into a vanilla MLLM-based MERC model, thereby facilitating the modeling of emotional dynamics during a conversation. Furthermore, BeMERC adopts a two-stage instruction tuning strategy to extend the model to the conversations scenario for end-to-end training of a MERC predictor. Experiments demonstrate that BeMERC achieves superior performance than the state-of-the-art methods on two benchmark datasets, and also provides a detailed discussion on the significance of video-derived behavior information in MERC. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.23990v1-abstract-full').style.display = 'none'; document.getElementById('2503.23990v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.23829">arXiv:2503.23829</a> <span> [<a href="https://arxiv.org/pdf/2503.23829">pdf</a>, <a href="https://arxiv.org/format/2503.23829">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Expanding RL with Verifiable Rewards Across Diverse Domains </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Su%2C+Y">Yi Su</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+D">Dian Yu</a>, <a href="/search/cs?searchtype=author&query=Song%2C+L">Linfeng Song</a>, <a href="/search/cs?searchtype=author&query=Li%2C+J">Juntao Li</a>, <a href="/search/cs?searchtype=author&query=Mi%2C+H">Haitao Mi</a>, <a href="/search/cs?searchtype=author&query=Tu%2C+Z">Zhaopeng Tu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+M">Min Zhang</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+D">Dong Yu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.23829v1-abstract-short" style="display: inline;"> Reinforcement learning (RL) with verifiable rewards (RLVR) has shown promising results in mathematical reasoning and coding tasks where well-structured reference answers are available. However, its applicability to broader domains remains underexplored. In this work, we study the extension of RLVR to more diverse domains such as medicine, chemistry, psychology, and economics. We observe high agree… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.23829v1-abstract-full').style.display = 'inline'; document.getElementById('2503.23829v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.23829v1-abstract-full" style="display: none;"> Reinforcement learning (RL) with verifiable rewards (RLVR) has shown promising results in mathematical reasoning and coding tasks where well-structured reference answers are available. However, its applicability to broader domains remains underexplored. In this work, we study the extension of RLVR to more diverse domains such as medicine, chemistry, psychology, and economics. We observe high agreement in binary judgments across different large language models (LLMs) when objective reference answers exist, which challenges the necessity of large-scale annotation for training domain-specific reward models. To address the limitations of binary rewards when handling unstructured reference answers, we further incorporate model-based soft scoring into RLVR to improve its flexibility. 

4. arXiv:2503.23512 [pdf, other] cs.CL (Computation and Language)
SCORE: Story Coherence and Retrieval Enhancement for AI Narratives
Authors: Qiang Yi, Yangfan He, Jianhui Wang, Xinyuan Song, Shiyao Qian, Miao Zhang, Li Sun, Tianyu Shi
Abstract: Large Language Models (LLMs) excel at generating creative narratives but struggle with long-term coherence and emotional consistency in complex stories. To address this, we propose SCORE (Story Coherence and Retrieval Enhancement), a framework integrating three components: (1) Dynamic State Tracking, which monitors objects and characters via symbolic logic; (2) Context-Aware Summarization, which builds hierarchical episode summaries for temporal progression; and (3) Hybrid Retrieval, which combines TF-IDF keyword relevance with cosine-similarity-based semantic embeddings. The system employs a temporally aligned Retrieval-Augmented Generation (RAG) pipeline to validate contextual consistency. Evaluations show SCORE achieves 23.6% higher coherence (NCI-2.0 benchmark), 89.7% emotional consistency (EASM metric), and 41.8% fewer hallucinations versus baseline GPT models. Its modular design supports incremental knowledge-graph construction for persistent story memory and multi-LLM backend compatibility, offering an explainable solution for industrial-scale narrative systems requiring long-term consistency.
Submitted 30 March, 2025; originally announced March 2025.
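
SCORE's Hybrid Retrieval component combines TF-IDF keyword relevance with cosine similarity over semantic embeddings. A minimal sketch of such a blend, assuming scikit-learn for TF-IDF and precomputed embeddings from any sentence encoder; the equal weighting is an assumption:

```python
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def hybrid_retrieve(query: str, episodes: list[str], episode_embs: np.ndarray,
                    query_emb: np.ndarray, w_kw: float = 0.5, top_k: int = 3):
    """Rank story episodes by a weighted blend of TF-IDF keyword relevance
    and embedding cosine similarity. The 50/50 weighting and the source of
    the embeddings (any sentence encoder) are assumptions for illustration."""
    # Keyword channel: TF-IDF similarity between the query and each episode.
    vec = TfidfVectorizer().fit(episodes + [query])
    kw_scores = cosine_similarity(vec.transform([query]), vec.transform(episodes))[0]

    # Semantic channel: cosine similarity in embedding space.
    sem_scores = cosine_similarity(query_emb.reshape(1, -1), episode_embs)[0]

    blended = w_kw * kw_scores + (1 - w_kw) * sem_scores
    order = np.argsort(blended)[::-1][:top_k]
    return [(episodes[i], float(blended[i])) for i in order]
```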

5. arXiv:2503.23510 [pdf, other] cs.CR (Cryptography and Security)
Demystifying Private Transactions and Their Impact in PoW and PoS Ethereum
Authors: Xingyu Lyu, Mengya Zhang, Xiaokuan Zhang, Jianyu Niu, Yinqian Zhang, Zhiqiang Lin
Abstract: In Ethereum, private transactions, a specialized transaction type employed to evade public Peer-to-Peer (P2P) network broadcasting, remain largely unexplored, particularly in the context of the transition from Proof-of-Work (PoW) to Proof-of-Stake (PoS) consensus mechanisms. To address this gap, we investigate their transaction characteristics, (un)intended usages, and monetary impacts by analyzing large-scale datasets comprising 14,810,392 private transactions from a 15.5-month PoW period and 30,062,232 private transactions from a 15.5-month PoS period. While originally designed for security purposes, we find that private transactions predominantly serve three distinct functions in both PoW and PoS Ethereum: extracting Maximum Extractable Value (MEV), facilitating monetary transfers to distribute mining rewards, and interacting with popular Decentralized Finance (DeFi) applications. Furthermore, we find that private transactions are used in DeFi attacks to circumvent surveillance by white-hat monitors, with an increased prevalence observed in PoS Ethereum compared to PoW Ethereum. Additionally, in PoS Ethereum there is a subtle uptick in the role of private transactions for MEV extraction, a shift that could be attributed to the decrease in transaction costs. However, this reduction in transaction costs, together with the cancellation of block rewards, results in a significant decrease in mining profits for block creators.
Submitted 30 March, 2025; originally announced March 2025.
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.23358">arXiv:2503.23358</a> <span> [<a href="https://arxiv.org/pdf/2503.23358">pdf</a>, <a href="https://arxiv.org/format/2503.23358">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> </div> </div> <p class="title is-5 mathjax"> Graph-Structured Driven Dual Adaptation for Mitigating Popularity Bias </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Cai%2C+M">Miaomiao Cai</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+L">Lei Chen</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yifan Wang</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+Z">Zhiyong Cheng</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+M">Min Zhang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+M">Meng Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.23358v1-abstract-short" style="display: inline;"> Popularity bias challenges recommender systems by causing uneven recommendation performance and amplifying the Matthew effect. Limited user-item interactions confine unpopular items within embedding neighborhoods of few users, leading to representation collapse and reduced model generalization. Existing supervised alignment and reweighting methods mitigate this bias but have key limitations: (1) i… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.23358v1-abstract-full').style.display = 'inline'; document.getElementById('2503.23358v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.23358v1-abstract-full" style="display: none;"> Popularity bias challenges recommender systems by causing uneven recommendation performance and amplifying the Matthew effect. Limited user-item interactions confine unpopular items within embedding neighborhoods of few users, leading to representation collapse and reduced model generalization. Existing supervised alignment and reweighting methods mitigate this bias but have key limitations: (1) ignoring inherent variability across Graph Convolutional Networks (GCNs) layers, causing negative effects in deeper layers; (2) reliance on fixed hyperparameters to balance item popularity, restricting adaptability and increasing complexity. To address these issues, we propose the Graph-Structured Dual Adaptation Framework (GSDA). Our theoretical analysis identifies a crucial limitation of supervised alignment methods caused by over-smoothing in GCNs. As GCN layers deepen, popular and unpopular items increasingly lose distinctiveness, quantified by reduced conditional entropy. This diminished distinctiveness weakens supervised alignment effectiveness in mitigating popularity bias. Motivated by this, GSDA captures structural and distribution characteristics from the adjacency matrix through a dual adaptive strategy. First, a hierarchical adaptive alignment mechanism uses the adjacency matrix's Frobenius norm for layer-specific weight decay, countering conditional entropy reduction effects at deeper layers. 
Second, a distribution-aware dynamic contrast weighting strategy, guided by a real-time Gini coefficient, removes dependence on fixed hyperparameters, enabling adaptability to diverse data. Experiments on three benchmark datasets demonstrate GSDA significantly alleviates popularity bias and consistently outperforms state-of-the-art recommendation methods. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.23358v1-abstract-full').style.display = 'none'; document.getElementById('2503.23358v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.23327">arXiv:2503.23327</a> <span> [<a href="https://arxiv.org/pdf/2503.23327">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> </div> </div> <p class="title is-5 mathjax"> AI Delivers Creative Output but Struggles with Thinking Processes </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+M">Man Zhang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Ying Li</a>, <a href="/search/cs?searchtype=author&query=Peng%2C+Y">Yang Peng</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+Y">Yijia Sun</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+W">Wenxin Guo</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+H">Huiqing Hu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+S">Shi Chen</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+Q">Qingbai Zhao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.23327v1-abstract-short" style="display: inline;"> A key objective in artificial intelligence (AI) development is to create systems that match or surpass human creativity. Although current AI models perform well across diverse creative tasks, it remains unclear whether these achievements reflect genuine creative thinking. This study examined whether AI models (GPT-3.5-turbo, GPT-4, and GPT-4o) engage in creative thinking by comparing their perform… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.23327v1-abstract-full').style.display = 'inline'; document.getElementById('2503.23327v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.23327v1-abstract-full" style="display: none;"> A key objective in artificial intelligence (AI) development is to create systems that match or surpass human creativity. Although current AI models perform well across diverse creative tasks, it remains unclear whether these achievements reflect genuine creative thinking. This study examined whether AI models (GPT-3.5-turbo, GPT-4, and GPT-4o) engage in creative thinking by comparing their performance with humans across various creative tasks and core cognitive processes. Results showed that AI models outperformed humans in divergent thinking, convergent thinking, and insight problem-solving, but underperformed in creative writing. 
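
GSDA's second mechanism keys contrast weights to a real-time Gini coefficient of the interaction distribution. The coefficient itself is a standard statistic; a small sketch follows (how GSDA maps it to a contrast weight is the paper's design and is not reproduced here):

```python
import numpy as np

def gini(counts: np.ndarray) -> float:
    """Gini coefficient of an item-popularity distribution:
    0 for a perfectly uniform distribution, approaching 1 for an
    extremely skewed (long-tailed) one."""
    x = np.sort(counts.astype(float))  # ascending order
    n = x.size
    total = x.sum()
    # Standard formula: G = (2 * sum_i i*x_i) / (n * sum_i x_i) - (n + 1) / n
    return (2 * np.sum(np.arange(1, n + 1) * x)) / (n * total) - (n + 1) / n

# Example: a long-tailed interaction-count vector is far from uniform.
pop = np.array([1000, 500, 90, 10, 5, 3, 2, 1, 1, 1])
print(f"Gini = {gini(pop):.3f}")  # close to 1 for skewed popularity
```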

7. arXiv:2503.23327 [pdf] cs.HC (Human-Computer Interaction)
AI Delivers Creative Output but Struggles with Thinking Processes
Authors: Man Zhang, Ying Li, Yang Peng, Yijia Sun, Wenxin Guo, Huiqing Hu, Shi Chen, Qingbai Zhao
Abstract: A key objective in artificial intelligence (AI) development is to create systems that match or surpass human creativity. Although current AI models perform well across diverse creative tasks, it remains unclear whether these achievements reflect genuine creative thinking. This study examined whether AI models (GPT-3.5-turbo, GPT-4, and GPT-4o) engage in creative thinking by comparing their performance with humans across various creative tasks and core cognitive processes. Results showed that AI models outperformed humans in divergent thinking, convergent thinking, and insight problem-solving, but underperformed in creative writing. Compared to humans, AI generated lower forward-flow values in both free and chain association tasks and showed lower accuracy in the representational change task. In creative evaluation, AI exhibited no significant correlation between the weights of novelty and appropriateness when predicting creative ratings, suggesting the absence of a human-like trade-off strategy. AI also had higher decision-error scores in creative selection, suggesting difficulty in identifying the most creative ideas. These findings suggest that while AI can mimic human creativity, its strong performance on creative tasks is likely driven by non-creative mechanisms rather than genuine creative thinking.
Submitted 30 March, 2025; originally announced March 2025.

8. arXiv:2503.22764 [pdf, other] cs.CL (Computation and Language); cs.AI (Artificial Intelligence); cs.LG (Machine Learning)
Boosting Large Language Models with Mask Fine-Tuning
Authors: Mingyuan Zhang, Yue Bai, Huan Wang, Yizhou Wang, Qihua Dong, Yun Fu
Abstract: Mainstream large language model (LLM) fine-tuning protocols usually keep the model intact, and no prior work has questioned whether maintaining the model's integrity is indispensable for performance. In this work, we introduce Mask Fine-Tuning (MFT), a new LLM fine-tuning paradigm showing that properly breaking the integrity of the model can, surprisingly, lead to improved performance. Specifically, MFT learns a set of binary masks supervised by the typical LLM fine-tuning objective. Extensive experiments show that MFT gains a consistent performance boost across various domains and backbones (e.g., 1.95%/1.88% average gain in coding with LLaMA2-7B/3.1-8B). Detailed procedures are provided to study the proposed MFT from different hyperparameter perspectives for better insight. In particular, MFT naturally updates the current LLM training protocol by deploying it on a complete, well-trained model. This study extends the functionality of mask learning from its conventional network-pruning context for model compression to a more general scope.
Submitted 27 March, 2025; originally announced March 2025.
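
MFT learns binary masks over a frozen, well-trained model under the ordinary fine-tuning loss. One plausible way to make a binary mask trainable is a straight-through estimator, sketched below; this parameterization is an assumption for illustration, not the paper's implementation:

```python
import torch
import torch.nn as nn

class MaskedLinear(nn.Module):
    """Frozen pretrained weight times a learnable binary mask (a sketch of
    the MFT idea: masks are trained with the usual fine-tuning loss; the
    straight-through parameterization here is an assumption)."""
    def __init__(self, linear: nn.Linear):
        super().__init__()
        self.weight = nn.Parameter(linear.weight.detach(), requires_grad=False)
        self.bias = (None if linear.bias is None
                     else nn.Parameter(linear.bias.detach(), requires_grad=False))
        # Real-valued mask logits; a positive logit keeps the weight.
        self.mask_logits = nn.Parameter(torch.full_like(self.weight, 3.0))

    def forward(self, x):
        hard = (self.mask_logits > 0).float()   # binary mask in {0, 1}
        soft = torch.sigmoid(self.mask_logits)  # differentiable surrogate
        mask = hard + (soft - soft.detach())    # straight-through gradient
        return nn.functional.linear(x, self.weight * mask, self.bias)

# Usage sketch: wrap a layer, then optimize only the mask logits with the
# standard fine-tuning objective (e.g., next-token cross-entropy).
layer = MaskedLinear(nn.Linear(16, 16))
opt = torch.optim.AdamW([layer.mask_logits], lr=1e-3)
```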

9. arXiv:2503.22081 [pdf, other] cs.CV (Computer Vision and Pattern Recognition)
A Survey on Remote Sensing Foundation Models: From Vision to Multimodality
Authors: Ziyue Huang, Hongxi Yan, Qiqi Zhan, Shuai Yang, Mingming Zhang, Chenkai Zhang, YiMing Lei, Zeming Liu, Qingjie Liu, Yunhong Wang
Abstract: The rapid advancement of remote sensing foundation models, particularly vision and multimodal models, has significantly enhanced the capabilities of intelligent geospatial data interpretation. These models combine various data modalities, such as optical, radar, and LiDAR imagery, with textual and geographic information, enabling more comprehensive analysis and understanding of remote sensing data. The integration of multiple modalities allows for improved performance in tasks like object detection, land-cover classification, and change detection, which are often challenged by the complex and heterogeneous nature of remote sensing data. However, despite these advancements, several challenges remain. The diversity of data types, the need for large-scale annotated datasets, and the complexity of multimodal fusion techniques pose significant obstacles to the effective deployment of these models. Moreover, the computational demands of training and fine-tuning multimodal models require significant resources, further complicating their practical application in remote sensing image interpretation tasks. This paper provides a comprehensive review of the state of the art in vision and multimodal foundation models for remote sensing, focusing on their architectures, training methods, datasets, and application scenarios. We discuss the key challenges these models face, such as data alignment, cross-modal transfer learning, and scalability, while also identifying emerging research directions aimed at overcoming these limitations. Our goal is to provide a clear understanding of the current landscape of remote sensing foundation models and to inspire future research that can push the boundaries of what these models can achieve in real-world applications. The list of resources collected by the paper can be found at https://github.com/IRIP-BUAA/A-Review-for-remote-sensing-vision-language-models.
Submitted 27 March, 2025; originally announced March 2025.

10. arXiv:2503.21758 [pdf, other] cs.CV (Computer Vision and Pattern Recognition)
Lumina-Image 2.0: A Unified and Efficient Image Generative Framework
Authors: Qi Qin, Le Zhuo, Yi Xin, Ruoyi Du, Zhen Li, Bin Fu, Yiting Lu, Jiakang Yuan, Xinyue Li, Dongyang Liu, Xiangyang Zhu, Manyuan Zhang, Will Beddow, Erwann Millon, Victor Perez, Wenhai Wang, Conghui He, Bo Zhang, Xiaohong Liu, Hongsheng Li, Yu Qiao, Chang Xu, Peng Gao
Abstract: We introduce Lumina-Image 2.0, an advanced text-to-image generation framework that achieves significant progress compared to previous work, Lumina-Next. Lumina-Image 2.0 is built upon two key principles. (1) Unification: it adopts a unified architecture (Unified Next-DiT) that treats text and image tokens as a joint sequence, enabling natural cross-modal interactions and seamless task expansion. Moreover, since high-quality captioners can provide semantically well-aligned text-image training pairs, we introduce a unified captioning system, Unified Captioner (UniCap), specifically designed for T2I generation tasks; UniCap excels at generating comprehensive and accurate captions, accelerating convergence and enhancing prompt adherence. (2) Efficiency: to improve the efficiency of the proposed model, we develop multi-stage progressive training strategies and introduce inference-acceleration techniques without compromising image quality. Extensive evaluations on academic benchmarks and public text-to-image arenas show that Lumina-Image 2.0 delivers strong performance even with only 2.6B parameters, highlighting its scalability and design efficiency. We have released our training details, code, and models at https://github.com/Alpha-VLLM/Lumina-Image-2.0.
Submitted 27 March, 2025; originally announced March 2025.
Comments: Tech Report, 21 pages, 12 figures

11. arXiv:2503.21745 [pdf, other] cs.CV (Computer Vision and Pattern Recognition)
3DGen-Bench: Comprehensive Benchmark Suite for 3D Generative Models
Authors: Yuhan Zhang, Mengchen Zhang, Tong Wu, Tengfei Wang, Gordon Wetzstein, Dahua Lin, Ziwei Liu
Abstract: 3D generation is experiencing rapid advancements, while the development of 3D evaluation has not kept pace. How to keep automatic evaluation equitably aligned with human perception has become a well-recognized challenge. Recent advances in language and image generation have explored human preferences and showcased respectable fitting ability. However, the 3D domain still lacks such a comprehensive preference dataset over generative models. To mitigate this absence, we develop 3DGen-Arena, an integrated platform with a battle format. We then carefully design diverse text and image prompts and leverage the arena platform to gather human preferences from both public users and expert annotators, resulting in 3DGen-Bench, a large-scale, multi-dimensional human preference dataset. Using this dataset, we further train a CLIP-based scoring model, 3DGen-Score, and an MLLM-based automatic evaluator, 3DGen-Eval. These two models innovatively unify the quality evaluation of text-to-3D and image-to-3D generation, and together form our automated evaluation system, combining their respective strengths. Extensive experiments demonstrate the efficacy of our scoring model in predicting human preferences, exhibiting a superior correlation with human ranks compared to existing metrics. We believe that our 3DGen-Bench dataset and automated evaluation system will foster more equitable evaluation in the field of 3D generation, further promoting the development of 3D generative models and their downstream applications.
Submitted 27 March, 2025; originally announced March 2025.
arXiv:2503.21460 (https://arxiv.org/abs/2503.21460) [pdf, other] cs.CL
Large Language Model Agent: A Survey on Methodology, Applications and Challenges
Authors: Junyu Luo, Weizhi Zhang, Ye Yuan, Yusheng Zhao, Junwei Yang, Yiyang Gu, Bohan Wu, Binqi Chen, Ziyue Qiao, Qingqing Long, Rongcheng Tu, Xiao Luo, Wei Ju, Zhiping Xiao, Yifan Wang, Meng Xiao, Chenwu Liu, Jingyang Yuan, Shichang Zhang, Yiqiao Jin, Fan Zhang, Xian Wu, Hanqing Zhao, Dacheng Tao, Philip S. Yu, et al. (1 additional author not shown)
Abstract: The era of intelligent agents is upon us, driven by revolutionary advancements in large language models.
Large Language Model (LLM) agents, with goal-driven behaviors and dynamic adaptation capabilities, potentially represent a critical pathway toward artificial general intelligence. This survey systematically deconstructs LLM agent systems through a methodology-centered taxonomy, linking architectural foundations, collaboration mechanisms, and evolutionary pathways. We unify fragmented research threads by revealing fundamental connections between agent design principles and their emergent behaviors in complex environments. Our work provides a unified architectural perspective, examining how agents are constructed, how they collaborate, and how they evolve over time, while also addressing evaluation methodologies, tool applications, practical challenges, and diverse application domains. By surveying the latest developments in this rapidly evolving field, we offer researchers a structured taxonomy for understanding LLM agents and identify promising directions for future research. The collection is available at https://github.com/luo-junyu/Awesome-Agent-Papers.
Submitted 27 March, 2025; originally announced March 2025.
Comments: 329 papers surveyed; resources are at https://github.com/luo-junyu/Awesome-Agent-Papers

arXiv:2503.20349 (https://arxiv.org/abs/2503.20349) [pdf, other] cs.CV
Consistency Trajectory Matching for One-Step Generative Super-Resolution
Authors: Weiyi You, Mingyang Zhang, Leheng Zhang, Xingyu Zhou, Kexuan Shi, Shuhang Gu
Abstract: Current diffusion-based super-resolution (SR) approaches achieve commendable performance at the cost of high inference overhead. Distillation techniques are therefore used to accelerate a multi-step teacher model into a one-step student model. Nevertheless, these methods significantly raise training costs and cap the student model's performance at that of the teacher.
To overcome these challenges, we propose Consistency Trajectory Matching for Super-Resolution (CTMSR), a distillation-free strategy that generates photo-realistic SR results in one step. Concretely, we first formulate a Probability Flow Ordinary Differential Equation (PF-ODE) trajectory to establish a deterministic mapping from low-resolution (LR) images with noise to high-resolution (HR) images. We then apply the Consistency Training (CT) strategy to learn this mapping directly in one step, eliminating the need for a pre-trained diffusion model. To further enhance performance and better leverage the ground truth during training, we align the distribution of SR results more closely with that of natural images: we minimize the discrepancy between their respective PF-ODE trajectories from the LR image distribution with a carefully designed Distribution Trajectory Matching (DTM) loss, improving the realism of the recovered HR images. Comprehensive experimental results demonstrate that the proposed method attains comparable or even superior performance on both synthetic and real datasets while maintaining minimal inference latency.
Submitted 27 March, 2025; v1 submitted 26 March, 2025; originally announced March 2025.
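The consistency-training idea that CTMSR builds on can be shown in a toy form: a network f(x_t, t) is trained so that two points on the same PF-ODE trajectory map to the same output, which is what enables one-step generation. This is a generic consistency-model loss under simplified linear noising, not the paper's exact DTM objective.

```python
# Toy consistency-training step: match the student's output at a higher
# noise level to a frozen (EMA) target's output at an adjacent lower level.
import copy
import torch
import torch.nn as nn

net = nn.Sequential(nn.Linear(17, 64), nn.SiLU(), nn.Linear(64, 16))
ema = copy.deepcopy(net).requires_grad_(False)  # slowly-updated target net

def f(model, x, t):
    # Condition on the noise level by concatenating t to the input features.
    return model(torch.cat([x, t[:, None]], dim=1))

x0 = torch.randn(8, 16)          # stand-in for HR image features
noise = torch.randn_like(x0)
t2 = torch.rand(8) * 0.9 + 0.1   # two adjacent points on one trajectory
t1 = t2 - 0.05
x_t1, x_t2 = x0 + t1[:, None] * noise, x0 + t2[:, None] * noise

loss = (f(net, x_t2, t2) - f(ema, x_t1, t1).detach()).pow(2).mean()
loss.backward()
print(float(loss))
```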
arXiv:2503.20232 (https://arxiv.org/abs/2503.20232) [pdf, other] cs.IR
Learnable Sequence Augmenter for Triplet Contrastive Learning in Sequential Recommendation
Authors: Wei Wang, Yujie Lin, Jianli Zhao, Moyan Zhang, Pengjie Ren, Xianye Ben, Yujun Li
Abstract: Most existing contrastive learning-based sequential recommendation (SR) methods rely on random operations (e.g., crop, reorder, and substitute) to generate augmented sequences. These methods often struggle to create positive sample pairs that closely resemble the representations of the raw sequences, and can disrupt item correlations by deleting key items or introducing noisy interactions, which misguides the contrastive learning process. To address this limitation, we propose Learnable sequence Augmentor for triplet Contrastive Learning in sequential Recommendation (LACLRec). Specifically, a self-supervised learning-based augmenter automatically deletes noisy items from sequences and inserts new items that better capture item transition patterns, generating a higher-quality augmented sequence. We then randomly generate another augmented sequence and design a ranking-based triplet contrastive loss to differentiate the similarities between the raw sequence, the augmented sequence from the augmenter, and the randomly augmented sequence, providing more fine-grained contrastive signals. Extensive experiments on three real-world datasets demonstrate that both the sequence augmenter and the triplet contrast contribute to improving recommendation accuracy. LACLRec significantly outperforms the baseline model CL4SRec and demonstrates superior performance compared to several state-of-the-art sequential recommendation algorithms.
Submitted 26 March, 2025; originally announced March 2025.
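The ranking-based triplet signal described above has a natural reading, sketched here: the learned augmentation of a sequence should stay closer to the raw sequence's representation than a random augmentation does, by a margin. The encoder outputs and margin value are illustrative; the paper's exact loss may differ.

```python
# Hedged sketch of a ranking-based triplet contrastive loss over sequence
# representations: enforce sim(raw, learned_aug) >= sim(raw, random_aug) + m.
import torch
import torch.nn.functional as F

def triplet_rank_loss(h_raw, h_learned, h_random, margin=0.2):
    sim_learned = F.cosine_similarity(h_raw, h_learned, dim=-1)
    sim_random = F.cosine_similarity(h_raw, h_random, dim=-1)
    # Penalize cases where the random augmentation is as close as the
    # learned one.
    return F.relu(margin + sim_random - sim_learned).mean()

h_raw = torch.randn(32, 64)
h_learned = h_raw + 0.1 * torch.randn(32, 64)  # mild, learned edit
h_random = torch.randn(32, 64)                 # crop/reorder-style noise
print(float(triplet_rank_loss(h_raw, h_learned, h_random)))
```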
arXiv:2503.20205 (https://arxiv.org/abs/2503.20205) [pdf, other] cs.LG cs.AI
Generalized Phase Pressure Control Enhanced Reinforcement Learning for Traffic Signal Control
Authors: Xiao-Cheng Liao, Yi Mei, Mengjie Zhang, Xiang-Ling Chen
Abstract: Appropriate traffic state representation is crucial for learning traffic signal control policies. However, most current traffic state representations are heuristically designed, with insufficient theoretical support. In this paper, we (1) develop a flexible, efficient, and theoretically grounded method, namely generalized phase pressure (G2P) control, which considers only simple lane features to decide which phase to actuate; (2) extend the pressure control theory to a general form for multi-homogeneous-lane road networks based on queueing theory; (3) design a new traffic state representation based on the generalized phase state features from G2P control; and (4) develop a reinforcement learning (RL)-based algorithm template named G2P-XLight, with two RL algorithms, G2P-MPLight and G2P-CoLight, obtained by combining the generalized phase state representation with MPLight and CoLight, two well-performing RL methods for learning traffic signal control policies. Extensive experiments conducted on multiple real-world datasets demonstrate that G2P control outperforms the state-of-the-art (SOTA) heuristic method in the transportation field as well as other recent human-designed heuristics, and that the newly proposed G2P-XLight significantly outperforms SOTA learning-based approaches. Our code is available online.
Submitted 26 March, 2025; originally announced March 2025.
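For readers unfamiliar with pressure control, here is a minimal max-pressure-style controller that makes the "phase pressure" idea concrete: each phase's pressure is the total upstream queue minus the downstream queue over the movements it serves, and the highest-pressure phase is actuated. This is the classic heuristic the G2P work generalizes, not the paper's G2P formula itself.

```python
# Classic max-pressure control at a single intersection (illustrative data).

def phase_pressure(phase_movements, queue):
    """phase_movements: list of (in_lane, out_lane); queue: lane -> vehicles."""
    return sum(queue[i] - queue[o] for i, o in phase_movements)

queue = {"N_in": 12, "S_in": 9, "E_in": 4, "W_in": 6,
         "N_out": 2, "S_out": 1, "E_out": 5, "W_out": 0}
phases = {
    "NS": [("N_in", "S_out"), ("S_in", "N_out")],
    "EW": [("E_in", "W_out"), ("W_in", "E_out")],
}
best = max(phases, key=lambda p: phase_pressure(phases[p], queue))
print(best, {p: phase_pressure(m, queue) for p, m in phases.items()})
# -> NS {'NS': 18, 'EW': 5}: the north-south phase is actuated.
```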
arXiv:2503.19801 (https://arxiv.org/abs/2503.19801) [pdf, other] cs.CV cs.AI
SeLIP: Similarity Enhanced Contrastive Language Image Pretraining for Multi-modal Head MRI
Authors: Zhiyang Liu, Dong Yang, Minghao Zhang, Hanyu Sun, Hong Wu, Huiying Wang, Wen Shen, Chao Chai, Shuang Xia
Abstract: Although deep learning (DL) methods have shown tremendous potential in many medical image analysis tasks, the practical application of medical DL models is limited by the lack of sufficient data samples with manual annotations. Noting that clinical radiology examinations are accompanied by radiology reports that describe the images, we propose to develop a foundation model for multi-modal head MRI using contrastive learning on the images and the corresponding radiology findings. In particular, we propose a contrastive learning framework that integrates a mixed syntax and semantic similarity matching metric to reduce the need for the extremely large datasets required by conventional contrastive learning frameworks. Our proposed similarity enhanced contrastive language image pretraining (SeLIP) effectively extracts more useful features. Experiments show that SeLIP performs well in many downstream tasks, including image-text retrieval, classification, and image segmentation, which highlights the importance of considering the similarities among texts describing different images when developing medical image foundation models.
Submitted 25 March, 2025; originally announced March 2025.
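One common way to fold text similarity into a CLIP objective, sketched below, is to soften the contrastive targets: instead of treating every other report in the batch as a pure negative, targets come from an inter-report similarity matrix, so two near-identical radiology reports are not pushed apart. The random similarity matrix here is a stand-in; SeLIP derives its metric from mixed syntax and semantic matching.

```python
# Soft-target CLIP-style loss: targets are a softmax over report similarity
# rather than a one-hot identity (requires PyTorch >= 1.10 for probabilistic
# targets in F.cross_entropy).
import torch
import torch.nn.functional as F

def soft_clip_loss(img_emb, txt_emb, txt_sim, tau=0.07):
    img_emb = F.normalize(img_emb, dim=-1)
    txt_emb = F.normalize(txt_emb, dim=-1)
    logits = img_emb @ txt_emb.t() / tau
    targets = F.softmax(txt_sim / tau, dim=-1)  # softened by text similarity
    return F.cross_entropy(logits, targets)

B, D = 8, 32
img, txt = torch.randn(B, D), torch.randn(B, D)
sim = torch.eye(B) + 0.3 * torch.rand(B, B)  # stand-in report similarities
print(float(soft_clip_loss(img, txt, sim)))
```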
arXiv:2503.19455 (https://arxiv.org/abs/2503.19455) [pdf, other] cs.LG cs.AI
Data-centric Federated Graph Learning with Large Language Models
Authors: Bo Yan, Zhongjian Zhang, Huabin Sun, Mengmei Zhang, Yang Cao, Chuan Shi
Abstract: In federated graph learning (FGL), a complete graph is divided into multiple subgraphs stored on different clients due to privacy concerns, and all clients jointly train a global graph model by transmitting only model parameters. A pain point of FGL is the heterogeneity problem, where nodes or structures present non-IID properties among clients (e.g., different node label distributions), dramatically undermining the convergence and performance of FGL. To address this, existing efforts focus on strategies at the model level, i.e., they design models to extract common knowledge that mitigates heterogeneity. However, these model-level strategies fail to fundamentally address the heterogeneity problem, as the model needs to be designed from scratch when transferring to other tasks. Motivated by the remarkable success of large language models (LLMs), we aim to use LLMs to fully understand and augment local text-attributed graphs, addressing data heterogeneity at the data level. In this paper, we propose a general framework, LLM4FGL, that decomposes the task of LLMs for FGL into two theoretically grounded sub-tasks. Specifically, for each client, it first uses the LLM to generate missing neighbors and then infers connections between generated nodes and raw nodes. To improve the quality of generated nodes, we design a novel federated generation-and-reflection mechanism for LLMs that relies solely on collective feedback from all clients, without modifying the LLM's parameters. After neighbor generation, all clients use a pre-trained edge predictor to infer the missing edges. Furthermore, our framework can be seamlessly integrated as a plug-in with existing FGL methods. Experiments on three real-world datasets demonstrate the superiority of our method over advanced baselines.
Submitted 25 March, 2025; originally announced March 2025.
Comments: ongoing work
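The second sub-task, linking generated neighbors to raw nodes, admits a simple illustration: a lightweight edge predictor scores candidate links from pairs of node embeddings. The architecture and 0.5 threshold below are assumptions for the sketch, not the paper's trained predictor.

```python
# Toy edge predictor: an MLP over concatenated (generated, raw) node
# embeddings outputs a link probability.
import torch
import torch.nn as nn

class EdgePredictorSketch(nn.Module):
    def __init__(self, dim=64):
        super().__init__()
        self.mlp = nn.Sequential(nn.Linear(2 * dim, 64), nn.ReLU(),
                                 nn.Linear(64, 1))

    def forward(self, src, dst):
        return torch.sigmoid(self.mlp(torch.cat([src, dst], dim=-1))).squeeze(-1)

torch.manual_seed(0)
raw_nodes = torch.randn(10, 64)   # embeddings of a client's local graph
generated = torch.randn(3, 64)    # embeddings of LLM-generated neighbors
pred = EdgePredictorSketch()
for g in range(generated.shape[0]):
    scores = pred(generated[g].expand(10, -1), raw_nodes)
    links = (scores > 0.5).nonzero(as_tuple=True)[0].tolist()
    print(f"generated node {g} -> raw nodes {links}")
```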
arXiv:2503.19271 (https://arxiv.org/abs/2503.19271) [pdf, other] cs.CL cs.CV
MARS: Memory-Enhanced Agents with Reflective Self-improvement
Authors: Xuechen Liang, Meiling Tao, Yinghui Xia, Jianhui Wang, Kun Li, Yijin Wang, Jingsong Yang, Tianyu Shi, Yuantao Wang, Miao Zhang, Xueqian Wang
Abstract: Large language models (LLMs) have made significant advances in the field of natural language processing, but they still face challenges such as continuous decision-making, lack of long-term memory, and limited context windows in dynamic environments. To address these issues, this paper proposes an innovative framework, Memory-Enhanced Agents with Reflective Self-improvement (MARS). The MARS framework comprises three agents: the User, the Assistant, and the Checker. By integrating iterative feedback, reflective mechanisms, and a memory optimization mechanism based on the Ebbinghaus forgetting curve, it significantly enhances the agents' capabilities in handling multi-tasking and long-span information.
Submitted 24 March, 2025; originally announced March 2025.
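A common reading of an Ebbinghaus-style memory mechanism, sketched here, scores each stored memory by retention R = exp(-t / S), where t is time since last access and S is the memory's strength, and evicts items whose retention falls below a threshold. The threshold and the rehearsal rule are assumptions, not details from the paper.

```python
# Forgetting-curve memory store: recall strengthens a memory; stale,
# weak memories decay below threshold and are evicted.
import math
import time

class ForgettingMemory:
    def __init__(self, threshold=0.3):
        self.items = {}  # key -> (value, strength S, last-access time)
        self.threshold = threshold

    def add(self, key, value, strength=10.0):
        self.items[key] = (value, strength, time.time())

    def recall(self, key):
        value, s, _ = self.items[key]
        self.items[key] = (value, s * 1.5, time.time())  # rehearsal effect
        return value

    def retention(self, key, now=None):
        _, s, last = self.items[key]
        t = (now or time.time()) - last
        return math.exp(-t / s)  # Ebbinghaus curve R = exp(-t / S)

    def forget(self):
        now = time.time()
        self.items = {k: v for k, v in self.items.items()
                      if self.retention(k, now) >= self.threshold}

mem = ForgettingMemory()
mem.add("user_goal", "book a flight", strength=2.0)
print(round(mem.retention("user_goal"), 3))  # ~1.0 right after storing
```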
arXiv:2503.18891 (https://arxiv.org/abs/2503.18891) [pdf, other] cs.CL cs.AI
AgentDropout: Dynamic Agent Elimination for Token-Efficient and High-Performance LLM-Based Multi-Agent Collaboration
Authors: Zhexuan Wang, Yutong Wang, Xuebo Liu, Liang Ding, Miao Zhang, Jie Liu, Min Zhang
Abstract: Multi-agent systems (MAS) based on large language models (LLMs) have demonstrated significant potential in collaborative problem-solving. However, they still face substantial challenges of low communication efficiency and suboptimal task performance, making the careful design of the agents' communication topologies particularly important. Inspired by the management theory that roles in an efficient team are often dynamically adjusted, we propose AgentDropout, which identifies redundant agents and communication across different communication rounds by optimizing the adjacency matrices of the communication graphs, and eliminates them to enhance both token efficiency and task performance. Compared to state-of-the-art methods, AgentDropout achieves an average reduction of 21.6% in prompt token consumption and 18.4% in completion token consumption, along with a performance improvement of 1.14 on the tasks. Furthermore, extended experiments demonstrate that AgentDropout achieves notable domain transferability and structure robustness, revealing its reliability and effectiveness. We release our code at https://github.com/wangzx1219/AgentDropout.
Submitted 24 March, 2025; originally announced March 2025.
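The elimination step has a simple skeleton, sketched below: represent a communication round as a weighted adjacency matrix over agents, drop the agent with the lowest total communication weight, and prune the weakest edges. Learning those matrices end-to-end, as the paper does, is omitted; the weights and cutoffs here are arbitrary.

```python
# Agent and edge dropout on a per-round communication graph (illustrative).
import numpy as np

rng = np.random.default_rng(0)
A = rng.random((4, 4))
np.fill_diagonal(A, 0.0)                   # round-t communication graph

agent_load = A.sum(axis=0) + A.sum(axis=1) # how much each agent talks
drop_agent = int(agent_load.argmin())
A[drop_agent, :] = 0.0
A[:, drop_agent] = 0.0                     # agent dropout

edge_cutoff = np.quantile(A[A > 0], 0.25)  # prune weakest 25% of edges
A[A < edge_cutoff] = 0.0                   # edge dropout

print("dropped agent:", drop_agent)
print(np.round(A, 2))
```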
arXiv:2503.18872 (https://arxiv.org/abs/2503.18872) [pdf, other] cs.CV
Curriculum Coarse-to-Fine Selection for High-IPC Dataset Distillation
Authors: Yanda Chen, Gongwei Chen, Miao Zhang, Weili Guan, Liqiang Nie
Abstract: Dataset distillation (DD) excels at synthesizing a small number of images per class (IPC) but struggles to maintain its effectiveness in high-IPC settings. Recent works on dataset distillation demonstrate that combining distilled and real data can mitigate the effectiveness decay. However, our analysis of the combination paradigm reveals that the current one-shot and independent selection mechanism induces an incompatibility issue between distilled and real images. To address this issue, we introduce a novel curriculum coarse-to-fine selection (CCFS) method for efficient high-IPC dataset distillation. CCFS employs a curriculum selection framework for real data selection, where we leverage a coarse-to-fine strategy to select appropriate real data based on the current synthetic dataset in each curriculum. Extensive experiments validate CCFS, surpassing the state of the art by +6.6% on CIFAR-10, +5.8% on CIFAR-100, and +3.4% on Tiny-ImageNet under high-IPC settings. Notably, CCFS achieves 60.2% test accuracy on ResNet-18 with a 20% compression ratio of Tiny-ImageNet, closely matching full-dataset training with only 0.3% degradation. Code: https://github.com/CYDaaa30/CCFS.
Submitted 24 March, 2025; originally announced March 2025.
Comments: Accepted by CVPR2025
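One plausible shape for a curriculum coarse-to-fine selection loop, sketched under assumptions: at each stage, score the real pool with the current model, coarsely discard the easiest samples, then finely keep the hardest of the remainder. The scoring source, band sizes, and stage count are made up for brevity; in the actual method the model would be retrained and losses recomputed each curriculum stage.

```python
# Curriculum coarse-to-fine selection over a real-data pool (illustrative).
import numpy as np

rng = np.random.default_rng(0)
pool_loss = rng.exponential(1.0, size=1000)  # per-sample loss, current model
selected: list[int] = []

for stage, (keep_frac, take) in enumerate([(0.8, 100), (0.6, 100), (0.4, 100)]):
    remaining = np.setdiff1d(np.arange(1000), selected)
    losses = pool_loss[remaining]
    # Coarse filter: drop the easiest (lowest-loss) samples outright.
    coarse = remaining[losses > np.quantile(losses, 1 - keep_frac)]
    # Fine selection: among survivors, take the hardest `take` samples.
    hardest = coarse[np.argsort(pool_loss[coarse])[-take:]]
    selected.extend(hardest.tolist())
    print(f"stage {stage}: selected {len(hardest)}, total {len(selected)}")
```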
arXiv:2503.18454 (https://arxiv.org/abs/2503.18454) [pdf, other] cs.CV cs.LG
InPO: Inversion Preference Optimization with Reparametrized DDIM for Efficient Diffusion Model Alignment
Authors: Yunhong Lu, Qichao Wang, Hengyuan Cao, Xierui Wang, Xiaoyin Xu, Min Zhang
Abstract: Without using an explicit reward, direct preference optimization (DPO) employs paired human preference data to fine-tune generative models, a method that has garnered considerable attention in large language models (LLMs). However, the alignment of text-to-image (T2I) diffusion models with human preferences remains underexplored. In comparison to supervised fine-tuning, existing methods that align diffusion models suffer from low training efficiency and subpar generation quality due to the long Markov chain process and the intractability of the reverse process. To address these limitations, we introduce DDIM-InPO, an efficient method for direct preference alignment of diffusion models. Our approach conceptualizes the diffusion model as a single-step generative model, allowing us to selectively fine-tune the outputs of specific latent variables. To accomplish this, we first assign implicit rewards to any latent variable directly via a reparameterization technique, and we then construct an inversion technique to estimate appropriate latent variables for preference optimization. This modification enables the diffusion model to fine-tune only the outputs of latent variables that correlate strongly with the preference dataset. Experimental results indicate that DDIM-InPO achieves state-of-the-art performance with just 400 steps of fine-tuning, surpassing all preference-alignment baselines for T2I diffusion models on human preference evaluation tasks.
Submitted 24 March, 2025; originally announced March 2025.
Comments: Accepted by CVPR2025
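For context, the generic DPO objective that DDIM-InPO adapts to diffusion latents looks as follows: raise the likelihood of the preferred sample relative to a frozen reference model and lower it for the rejected one. The log-probability tensors below are placeholders for the reparameterized latent likelihoods the paper actually uses.

```python
# Generic DPO loss: -log sigmoid(beta * (ratio_chosen - ratio_rejected)).
import torch
import torch.nn.functional as F

def dpo_loss(logp_w, logp_l, ref_logp_w, ref_logp_l, beta=0.1):
    ratio_w = logp_w - ref_logp_w  # log pi_theta(y_w) - log pi_ref(y_w)
    ratio_l = logp_l - ref_logp_l  # log pi_theta(y_l) - log pi_ref(y_l)
    return -F.logsigmoid(beta * (ratio_w - ratio_l)).mean()

logp_w, logp_l = torch.randn(16), torch.randn(16)  # placeholder likelihoods
print(float(dpo_loss(logp_w, logp_l,
                     logp_w.detach() - 0.1, logp_l.detach() + 0.1)))
```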
arXiv:2503.17669 (https://arxiv.org/abs/2503.17669) [pdf, other] cs.CV
TDRI: Two-Phase Dialogue Refinement and Co-Adaptation for Interactive Image Generation
Authors: Yuheng Feng, Jianhui Wang, Kun Li, Sida Li, Tianyu Shi, Haoyue Han, Miao Zhang, Xueqian Wang
Abstract: Although text-to-image generation technologies have made significant advancements, they still face challenges when dealing with ambiguous prompts and aligning outputs with user intent. Our proposed framework, TDRI (Two-Phase Dialogue Refinement and Co-Adaptation), addresses these issues by enhancing image generation through iterative user interaction. It consists of two phases: the Initial Generation Phase, which creates base images from user prompts, and the Interactive Refinement Phase, which integrates user feedback through three key modules. The Dialogue-to-Prompt (D2P) module transforms user feedback into actionable prompts, improving the alignment between user intent and model input. The Feedback-Reflection (FR) module evaluates generated outputs against user expectations, identifying discrepancies and facilitating improvements. To keep results consistently high-quality, the Adaptive Optimization (AO) module fine-tunes the generation process by balancing user preferences and maintaining prompt fidelity. Experimental results show that TDRI outperforms existing methods, achieving 33.6% human preference compared to 6.2% for GPT-4 augmentation, and the highest CLIP and BLIP alignment scores (0.338 and 0.336, respectively). In iterative feedback tasks, user satisfaction increased to 88% after 8 rounds, with diminishing returns beyond 6 rounds. TDRI has also been found to reduce the number of iterations and improve personalization in the creation of fashion products. By streamlining the creative process and improving alignment with user preferences, TDRI shows strong potential for a wide range of applications in creative and industrial domains.
Submitted 22 March, 2025; originally announced March 2025.
arXiv:2503.17660 (https://arxiv.org/abs/2503.17660) [pdf, other] cs.CV
OMR-Diffusion: Optimizing Multi-Round Enhanced Training in Diffusion Models for Improved Intent Understanding
Authors: Kun Li, Jianhui Wang, Miao Zhang, Xueqian Wang
Abstract: Generative AI has significantly advanced text-driven image generation, but it still faces challenges in producing outputs that consistently align with evolving user preferences and intents, particularly in multi-turn dialogue scenarios. In this research, we present a Visual Co-Adaptation (VCA) framework that incorporates human-in-the-loop feedback, utilizing a well-trained reward model specifically designed to closely align with human preferences. Using a diverse multi-turn dialogue dataset, the framework applies multiple reward functions (such as diversity, consistency, and preference feedback) to refine the diffusion model through LoRA, effectively optimizing image generation based on user input. We also constructed multi-round dialogue datasets with prompts and image pairs that fit user intent well. Experiments show the model achieves 508 wins in human evaluation, outperforming DALL-E 3 (463 wins) and others. It also achieves 3.4 rounds in dialogue efficiency (vs. 13.7 for DALL-E 3) and excels in metrics like LPIPS (0.15) and BLIP (0.59). Various experiments demonstrate the effectiveness of the proposed method over state-of-the-art baselines, with significant improvements in image consistency and alignment with user intent.
Submitted 22 March, 2025; originally announced March 2025.
arXiv:2503.17641 (https://arxiv.org/abs/2503.17641) [pdf, other] cs.CV
InstructVEdit: A Holistic Approach for Instructional Video Editing
Authors: Chi Zhang, Chengjian Feng, Feng Yan, Qiming Zhang, Mingjin Zhang, Yujie Zhong, Jing Zhang, Lin Ma
Abstract: Video editing according to instructions is a highly challenging task due to the difficulty in collecting large-scale, high-quality edited video pair data. This scarcity not only limits the availability of training data but also hinders the systematic exploration of model architectures and training strategies. While prior work has improved specific aspects of video editing (e.g., synthesizing a video dataset using image editing techniques or decomposed video editing training), a holistic framework addressing the above challenges remains underexplored. In this study, we introduce InstructVEdit, a full-cycle instructional video editing approach that (1) establishes a reliable dataset curation workflow to initialize training, (2) incorporates two model architectural improvements to enhance edit quality while preserving temporal consistency, and (3) proposes an iterative refinement strategy leveraging real-world data to enhance generalization and minimize train-test discrepancies. Extensive experiments show that InstructVEdit achieves state-of-the-art performance in instruction-based video editing, demonstrating robust adaptability to diverse real-world scenarios. Project page: https://o937-blip.github.io/InstructVEdit.
Submitted 22 March, 2025; originally announced March 2025.
Comments: https://o937-blip.github.io/InstructVEdit
arXiv:2503.17343 (https://arxiv.org/abs/2503.17343) [pdf, other] cs.NI
Commercial Dishes Can Be My Ladder: Sustainable and Collaborative Data Offloading in LEO Satellite Networks
Authors: Yi Ching Chou, Long Chen, Hengzhi Wang, Feng Wang, Hao Fang, Haoyuan Zhao, Miao Zhang, Xiaoyi Fan, Jiangchuan Liu
Abstract: Low Earth Orbit (LEO) satellite networks, characterized by their high data throughput and low latency, have gained significant interest from both industry and academia. Routing data efficiently within these networks is essential for maintaining a high quality of service. However, current routing strategies, such as bent-pipe and inter-satellite link (ISL) routing, each face unique challenges. The bent-pipe strategy requires a dense deployment of dedicated ground stations, while the ISL-based strategy can negatively impact satellite battery lifespan due to increased traffic load, leading to sustainability issues. In this paper, we propose sustainable collaborative offloading, a framework that orchestrates groups of existing commercial resources, such as ground stations and 5G base stations, for data offloading. This orchestration enhances total capacity, overcoming the limitations of any single resource. We propose a collaborator group set construction algorithm to construct candidate groups, and a collaborator selection and total payment algorithm to select offloading targets and determine payments no less than the costs. Extensive real-world-based simulations show that our solution significantly improves energy consumption, satellite service life, and end-to-end latency.
Submitted 21 March, 2025; originally announced March 2025.
Comments: This is a preliminary extended version of the paper accepted to INFOCOM 2025
arXiv:2503.17181 [cs.SE, cs.AI]
LLMs Love Python: A Study of LLMs' Bias for Programming Languages and Libraries
Authors: Lukas Twist, Jie M. Zhang, Mark Harman, Don Syme, Joost Noppen, Detlef Nauck
Abstract: Programming language and library choices are crucial to software reliability and security. Poor or inconsistent choices can lead to increased technical debt, security vulnerabilities, and even catastrophic failures in safety-critical systems. As Large Language Models (LLMs) play an increasing role in code generation, it is essential to understand how they make these decisions. However, little is known about their preferences when selecting programming languages and libraries for different coding tasks. To fill this gap, this study provides the first in-depth investigation into LLM preferences for programming languages and libraries used when generating code. We assess the preferences of eight diverse LLMs by prompting them to complete various coding tasks, including widely-studied benchmarks and the more practical task of generating the initial structural code for new projects (a crucial step that often determines a project's language or library choices). Our findings reveal that LLMs heavily favour Python when solving language-agnostic problems, using it in 90%-97% of cases for benchmark tasks. Even when generating initial project code where Python is not a suitable language, it remains the most-used language in 58% of instances. Moreover, LLMs contradict their own language recommendations in 83% of project initialisation tasks, raising concerns about their reliability in guiding language selection. Similar biases toward well-established libraries further create serious discoverability challenges for newer open-source projects. These results highlight the need to improve LLMs' adaptability to diverse programming contexts and to develop mechanisms for mitigating programming language and library bias.
Submitted 21 March, 2025; originally announced March 2025.
Comments: 12 pages, 1 figure
arXiv:2503.17162 [cs.CV]
CoRLD: Contrastive Representation Learning Of Deformable Shapes In Images
Authors: Tonmoy Hossain, Miaomiao Zhang
Abstract: Deformable shape representations, parameterized by deformations relative to a given template, have proven effective for improved image analysis tasks. However, their broader applicability is hindered by two major challenges. First, existing methods mainly rely on a known template during testing, which is impractical and limits flexibility. Second, they often struggle to capture fine-grained, voxel-level distinctions between similar shapes (e.g., anatomical variations among healthy individuals, those with mild cognitive impairment, and diseased states). To address these limitations, we propose a novel framework - Contrastive Representation Learning of Deformable shapes (CoRLD) in learned deformation spaces and demonstrate its effectiveness in the context of image classification. Our CoRLD leverages a class-aware contrastive supervised learning objective in latent deformation spaces, promoting proximity among representations of similar classes while ensuring separation of dissimilar groups. In contrast to previous deep learning networks that require a reference image as input to predict deformation changes, our approach eliminates this dependency. Instead, template images are utilized solely as ground truth in the loss function during the training process, making our model more flexible and generalizable to a wide range of medical applications. We validate CoRLD on diverse datasets, including real brain magnetic resonance imaging (MRIs) and adrenal shapes derived from computed tomography (CT) scans. Experimental results show that our model effectively extracts deformable shape features, which can be easily integrated with existing classifiers to substantially boost the classification accuracy. Our code is available at GitHub.
Submitted 23 March, 2025; v1 submitted 21 March, 2025; originally announced March 2025.
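A "class-aware contrastive supervised learning objective" is typically a supervised-contrastive loss: pull same-class features together, push other classes apart. A minimal numpy sketch of that standard formulation, assuming unit-normalized latent deformation features; CoRLD's exact loss and architecture are not reproduced here:

```python
import numpy as np

def supcon_loss(z, labels, tau=0.1):
    z = z / np.linalg.norm(z, axis=1, keepdims=True)   # unit-normalize features
    sim = z @ z.T / tau                                # temperature-scaled sims
    n = len(labels)
    mask_self = np.eye(n, dtype=bool)
    logits = sim - sim.max(axis=1, keepdims=True)      # numerical stability
    exp = np.exp(logits)
    exp[mask_self] = 0.0                               # exclude self-pairs
    log_prob = logits - np.log(exp.sum(axis=1, keepdims=True))
    pos = (labels[:, None] == labels[None, :]) & ~mask_self
    # negative mean log-likelihood of same-class (positive) pairs per anchor
    per_anchor = -(log_prob * pos).sum(1) / np.maximum(pos.sum(1), 1)
    return per_anchor.mean()

rng = np.random.default_rng(0)
z = rng.normal(size=(8, 16))                            # stand-in latent features
labels = np.array([0, 0, 1, 1, 0, 1, 0, 1])
print(supcon_loss(z, labels))
```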
arXiv:2503.17014 [cs.RO]
Behavioral Conflict Avoidance Between Humans and Quadruped Robots in Shared Environments
Authors: Shuang Wei, Muhua Zhang, Yun Gan, Deqing Huang, Lei Ma, Chenguang Yang
Abstract: Nowadays, robots are increasingly operated in environments shared with humans, where conflicts between human and robot behaviors may compromise safety. This paper presents a proactive behavioral conflict avoidance framework based on the principle of adaptation to trends for quadruped robots that not only ensures the robot's safety but also minimizes interference with human activities. It can proactively avoid potential conflicts with approaching humans or other dynamic objects, whether the robot is stationary or in motion, then swiftly resume its tasks once the conflict subsides. An enhanced approach is proposed to achieve precise human detection and tracking on a vibratory robot platform equipped with a low-cost hybrid solid-state LiDAR. When a potential conflict is detected, the robot selects an avoidance point and executes an evasion maneuver before resuming its task. This approach contrasts with conventional methods that remain goal-driven, often resulting in aggressive behaviors, such as forcibly bypassing obstacles and causing conflicts or becoming stuck in deadlock scenarios. The selection of avoidance points is achieved by integrating static and dynamic obstacles to generate a potential field map. The robot then searches for feasible regions within this map and determines the optimal avoidance point using an evaluation function. Experimental results demonstrate that the framework significantly reduces interference with human activities and enhances the safety of both robots and people.
Submitted 21 March, 2025; originally announced March 2025.
Comments: 7 pages, 9 figures. This work has been submitted to the IEEE for possible publication
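The avoidance-point step can be pictured concretely: build a potential field from obstacles, mask to the feasible region, and take the minimizer of an evaluation function. A toy sketch under assumed choices (Gaussian potentials, a travel-cost term); the paper's field construction and evaluation function are not specified here:

```python
import numpy as np

def pick_avoidance_point(grid_xy, obstacles, robot_xy, feasible_mask):
    field = np.zeros(len(grid_xy))
    for ox, oy, weight in obstacles:               # dynamic obstacles can carry
        d2 = (grid_xy[:, 0] - ox) ** 2 + (grid_xy[:, 1] - oy) ** 2
        field += weight * np.exp(-d2 / 2.0)        # larger weights
    travel = np.linalg.norm(grid_xy - robot_xy, axis=1)
    score = field + 0.2 * travel                   # assumed evaluation function
    score[~feasible_mask] = np.inf                 # search feasible regions only
    return grid_xy[np.argmin(score)]

xs, ys = np.meshgrid(np.linspace(0, 5, 26), np.linspace(0, 5, 26))
grid = np.stack([xs.ravel(), ys.ravel()], axis=1)
print(pick_avoidance_point(grid, [(2.5, 2.5, 3.0)], np.array([1.0, 1.0]),
                           feasible_mask=np.ones(len(grid), dtype=bool)))
```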
arXiv:2503.17005 [cs.RO, eess.SY]
Autonomous Exploration-Based Precise Mapping for Mobile Robots through Stepwise and Consistent Motions
Authors: Muhua Zhang, Lei Ma, Ying Wu, Kai Shen, Yongkui Sun, Henry Leung
Abstract: This paper presents an autonomous exploration framework. It is designed for indoor ground mobile robots that utilize laser Simultaneous Localization and Mapping (SLAM), ensuring process completeness and precise mapping results. For frontier search, the local-global sampling architecture based on multiple Rapidly Exploring Random Trees (RRTs) is employed. Traversability checks during RRT expansion and global RRT pruning upon map updates eliminate unreachable frontiers, reducing potential collisions and deadlocks. Adaptive sampling density adjustments, informed by obstacle distribution, enhance exploration coverage potential. For frontier point navigation, a stepwise consistent motion strategy is adopted, wherein the robot strictly drives straight on approximately equidistant line segments in the polyline path and rotates in place at segment junctions. This simplified, decoupled motion pattern improves scan-matching stability and mitigates map drift. For process control, the framework serializes frontier point selection and navigation, avoiding oscillation caused by frequent goal changes in conventional parallelized processes. The waypoint retracing mechanism is introduced to generate repeated observations, triggering loop closure detection and backend optimization in graph-based SLAM, thereby improving map consistency and precision. Experiments in both simulation and real-world scenarios validate the effectiveness of the framework. It achieves improved mapping coverage and precision in more challenging environments compared to baseline 2D exploration algorithms. It also shows robustness in supporting resource-constrained robot platforms and maintaining mapping consistency across various LiDAR field-of-view (FoV) configurations.
Submitted 21 March, 2025; originally announced March 2025.
Comments: 8 pages, 11 figures. This work has been submitted to the IEEE for possible publication
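The stepwise, decoupled motion pattern (rotate in place at each junction, then drive straight) reduces a polyline path to an alternating command sequence. A minimal sketch; command names and the initial heading (facing +x) are assumptions:

```python
import math

def polyline_to_commands(waypoints):
    """Decompose a polyline into rotate-in-place / drive-straight commands."""
    cmds, heading = [], 0.0                       # assume robot starts facing +x
    for (x0, y0), (x1, y1) in zip(waypoints, waypoints[1:]):
        target = math.atan2(y1 - y0, x1 - x0)
        turn = (target - heading + math.pi) % (2 * math.pi) - math.pi  # wrap to [-pi, pi]
        cmds.append(("rotate_in_place", turn))
        cmds.append(("drive_straight", math.hypot(x1 - x0, y1 - y0)))
        heading = target
    return cmds

for cmd in polyline_to_commands([(0, 0), (2, 0), (2, 2)]):
    print(cmd)
```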
arXiv:2503.16444 [cs.HC, cs.AI]
Conversational Explanations: Discussing Explainable AI with Non-AI Experts
Authors: Tong Zhang, Mengao Zhang, Wei Yan Low, X. Jessie Yang, Boyang Li
Abstract: Explainable AI (XAI) aims to provide insights into the decisions made by AI models. To date, most XAI approaches provide only one-time, static explanations, which cannot cater to users' diverse knowledge levels and information needs. Conversational explanations have been proposed as an effective method to customize XAI explanations. However, building conversational explanation systems is hindered by the scarcity of training data. Training with synthetic data faces two main challenges: lack of data diversity and hallucination in the generated data. To alleviate these issues, we introduce a repetition penalty to promote data diversity and exploit a hallucination detector to filter out untruthful synthetic conversation turns. We conducted both automatic and human evaluations on the proposed system, fEw-shot Multi-round ConvErsational Explanation (EMCEE). For automatic evaluation, EMCEE achieves relative improvements of 81.6% in BLEU and 80.5% in ROUGE compared to the baselines. EMCEE also mitigates the degeneration of data quality caused by training on synthetic data. In human evaluations (N=60), EMCEE outperforms baseline models and the control group in improving users' comprehension, acceptance, trust, and collaboration with static explanations by large margins. Through a fine-grained analysis of model responses, we further demonstrate that training on self-generated synthetic data improves the model's ability to generate more truthful and understandable answers, leading to better user interactions. To the best of our knowledge, this is the first conversational explanation method that can answer free-form user questions following static explanations.
Submitted 16 February, 2025; originally announced March 2025.
Comments: Accepted to IUI 2025
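EMCEE's exact penalty formulation is not given in the abstract; the common CTRL-style repetition penalty, shown below only to illustrate how penalizing already-generated tokens promotes diversity in synthetic conversations, rescales the logits of seen tokens before sampling:

```python
import numpy as np

def penalize_repeats(logits, generated_ids, penalty=1.2):
    """Downweight next-token logits for tokens already generated."""
    logits = logits.copy()
    for t in set(generated_ids):
        # Divide positive logits, multiply negative ones, so both move down.
        logits[t] = logits[t] / penalty if logits[t] > 0 else logits[t] * penalty
    return logits

logits = np.array([2.0, 0.5, -1.0, 1.5])
print(penalize_repeats(logits, generated_ids=[0, 2]))  # [1.667, 0.5, -1.2, 1.5]
```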
arXiv:2503.15916 [cs.CR, cs.AR]
ALLMod: Exploring Area-Efficiency of LUT-based Large Number Modular Reduction via Hybrid Workloads
Authors: Fangxin Liu, Haomin Li, Zongwu Wang, Bo Zhang, Mingzhe Zhang, Shoumeng Yan, Li Jiang, Haibing Guan
Abstract: Modular arithmetic, particularly modular reduction, is widely used in cryptographic applications such as homomorphic encryption (HE) and zero-knowledge proofs (ZKP). High-bit-width operations are crucial for enhancing security; however, they are computationally intensive due to the large number of modular operations required. The lookup-table-based (LUT-based) approach, a "space-for-time" technique, reduces computational load by segmenting the input number into smaller bit groups, pre-computing modular reduction results for each segment, and storing these results in LUTs. While effective, this method incurs significant hardware overhead due to extensive LUT usage. In this paper, we introduce ALLMod, a novel approach that improves the area efficiency of LUT-based large-number modular reduction by employing hybrid workloads. Inspired by the iterative method, ALLMod splits the bit groups into two distinct workloads, achieving lower area costs without compromising throughput. We first develop a template to facilitate workload splitting and ensure balanced distribution. Then, we conduct design space exploration to evaluate the optimal timing for fusing workload results, enabling us to identify the most efficient design under specific constraints. Extensive evaluations show that ALLMod achieves up to 1.65x and 3x improvements in area efficiency over conventional LUT-based methods for bit-widths of 128 and 8,192, respectively.
Submitted 20 March, 2025; originally announced March 2025.
Comments: Accepted by the 62nd Design Automation Conference (DAC 2025)
arXiv:2503.15197 [cs.CV]
Detect-and-Guide: Self-regulation of Diffusion Models for Safe Text-to-Image Generation via Guideline Token Optimization
Authors: Feifei Li, Mi Zhang, Yiming Sun, Min Yang
Abstract: Text-to-image diffusion models have achieved state-of-the-art results in synthesis tasks; however, there is a growing concern about their potential misuse in creating harmful content. To mitigate these risks, post-hoc model intervention techniques, such as concept unlearning and safety guidance, have been developed. However, fine-tuning model weights or adapting the hidden states of the diffusion model operates in an uninterpretable way, making it unclear which part of the intermediate variables is responsible for unsafe generation. These interventions severely affect the sampling trajectory when erasing harmful concepts from complex, multi-concept prompts, thus hindering their practical use in real-world settings. In this work, we propose the safe generation framework Detect-and-Guide (DAG), leveraging the internal knowledge of diffusion models to perform self-diagnosis and fine-grained self-regulation during the sampling process. DAG first detects harmful concepts from noisy latents using refined cross-attention maps of optimized tokens, then applies safety guidance with adaptive strength and editing regions to negate unsafe generation. The optimization only requires a small annotated dataset and can provide precise detection maps with generalizability and concept specificity. Moreover, DAG does not require fine-tuning of diffusion models, and therefore introduces no loss to their generation diversity. Experiments on erasing sexual content show that DAG achieves state-of-the-art safe generation performance, balancing harmfulness mitigation and text-following performance on multi-concept real-world prompts.
Submitted 19 March, 2025; originally announced March 2025.
Comments: CVPR25
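Region-targeted guidance with adaptive strength can be pictured loosely as steering the denoising direction away from a harmful concept only where a detection map is active. The sketch below is a generic negative-guidance blend, not DAG's actual detector or schedule; shapes and the strength rule are assumptions:

```python
import numpy as np

def guided_update(eps_uncond, eps_harm, detect_map, max_strength=7.5):
    """Push the predicted noise away from the harmful-concept direction,
    with per-pixel strength proportional to the detection map."""
    strength = max_strength * detect_map          # adaptive, region-wise strength
    return eps_uncond - strength * (eps_harm - eps_uncond)

h = w = 4
eps_u = np.zeros((h, w))                          # stand-in noise predictions
eps_h = np.ones((h, w))
mask = np.zeros((h, w)); mask[1:3, 1:3] = 1.0     # detected unsafe region
print(guided_update(eps_u, eps_h, mask))          # edits only inside the mask
```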
arXiv:2503.14845 [cs.GR, cs.CV]
ClimateGS: Real-Time Climate Simulation with 3D Gaussian Style Transfer
Authors: Yuezhen Xie, Meiying Zhang, Qi Hao
Abstract: Adverse climate conditions pose significant challenges for autonomous systems, demanding reliable perception and decision-making across diverse environments. To better simulate these conditions, physically-based NeRF rendering methods have been explored for their ability to generate realistic scene representations. However, these methods suffer from slow rendering speeds and long preprocessing times, making them impractical for real-time testing and user interaction. This paper presents ClimateGS, a novel framework integrating 3D Gaussian representations with physical simulation to enable real-time climate effects rendering. The novelty of this work is threefold: 1) developing a linear transformation for 3D Gaussian photorealistic style transfer, enabling direct modification of spherical harmonics across bands for efficient and consistent style adaptation; 2) developing a joint training strategy for 3D style transfer, combining supervised and self-supervised learning to accelerate convergence while preserving original scene details; 3) developing a real-time rendering method for climate simulation, integrating physics-based effects with 3D Gaussian to achieve efficient and realistic rendering. We evaluate ClimateGS on MipNeRF360 and Tanks and Temples, demonstrating real-time rendering with comparable or superior visual quality to SOTA 2D/3D methods, making it suitable for interactive applications.
Submitted 18 March, 2025; originally announced March 2025.
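Point 1) can be illustrated as one learned linear color map applied to the spherical-harmonic (SH) color coefficients of every Gaussian. A minimal sketch; the array shapes and the choice to shift only the band-0 (DC) term are assumptions for illustration, not ClimateGS's exact parameterization:

```python
import numpy as np

def restyle_sh(sh, A, b):
    """sh: (N, B, 3) SH color coefficients; A: (3, 3) color map; b: (3,) bias."""
    out = np.einsum("nbc,dc->nbd", sh, A)   # same linear map on every band
    out[:, 0, :] += b                       # bias shifts only the DC term
    return out

sh = np.random.default_rng(1).normal(size=(5, 4, 3))   # 5 Gaussians, 4 bands
A = np.eye(3) * 0.9                                    # e.g., mild desaturation
print(restyle_sh(sh, A, b=np.array([0.05, 0.0, -0.05])).shape)  # (5, 4, 3)
```

Because the map is linear and shared, it can be baked into the stored coefficients once, which is what makes the restyled scene renderable in real time.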
arXiv:2503.14837 [cs.CV, cs.RO]
SemanticFlow: A Self-Supervised Framework for Joint Scene Flow Prediction and Instance Segmentation in Dynamic Environments
Authors: Yinqi Chen, Meiying Zhang, Qi Hao, Guang Zhou
Abstract: Accurate perception of dynamic traffic scenes is crucial for high-level autonomous driving systems, requiring robust object motion estimation and instance segmentation. However, traditional methods often treat them as separate tasks, leading to suboptimal performance, spatio-temporal inconsistencies, and inefficiency in complex scenarios due to the absence of information sharing. This paper proposes a multi-task SemanticFlow framework to simultaneously predict scene flow and instance segmentation of full-resolution point clouds. The novelty of this work is threefold: 1) developing a coarse-to-fine prediction based multi-task scheme, where an initial coarse segmentation of static backgrounds and dynamic objects is used to provide contextual information for refining motion and semantic information through a shared feature processing module; 2) developing a set of loss functions to enhance the performance of scene flow estimation and instance segmentation, which help ensure spatial and temporal consistency of both static and dynamic objects within traffic scenes; 3) developing a self-supervised learning scheme, which utilizes coarse segmentation to detect rigid objects and compute their transformation matrices between sequential frames, enabling the generation of self-supervised labels. The proposed framework is validated on the Argoverse and Waymo datasets, demonstrating superior performance in instance segmentation accuracy, scene flow estimation, and computational efficiency, establishing a new benchmark for self-supervised methods in dynamic scene understanding.
Submitted 18 March, 2025; originally announced March 2025.
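The self-supervised labels in point 3) hinge on recovering a rigid transform between matched points of an object in two frames, a least-squares problem with the classical Kabsch/SVD solution. A self-contained sketch of that standard step (not SemanticFlow's full pipeline):

```python
import numpy as np

def rigid_transform(P, Q):
    """Least-squares R, t with Q ~ P @ R.T + t; P, Q: (n, 3) matched points."""
    Pc, Qc = P - P.mean(0), Q - Q.mean(0)
    U, _, Vt = np.linalg.svd(Pc.T @ Qc)
    d = np.sign(np.linalg.det(Vt.T @ U.T))        # guard against reflections
    R = Vt.T @ np.diag([1, 1, d]) @ U.T
    t = Q.mean(0) - P.mean(0) @ R.T
    return R, t

P = np.random.default_rng(2).normal(size=(30, 3))  # object points, frame t
theta = 0.3
R_true = np.array([[np.cos(theta), -np.sin(theta), 0],
                   [np.sin(theta),  np.cos(theta), 0],
                   [0, 0, 1]])
Q = P @ R_true.T + np.array([0.5, -0.2, 0.1])      # same object, frame t+1
R, t = rigid_transform(P, Q)
print(np.allclose(R, R_true), np.allclose(t, [0.5, -0.2, 0.1]))  # True True
```

The recovered (R, t) then yields a per-point flow label Q - P for that rigid object, with no human annotation.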
arXiv:2503.14476 [cs.LG, cs.CL]
DAPO: An Open-Source LLM Reinforcement Learning System at Scale
Authors: Qiying Yu, Zheng Zhang, Ruofei Zhu, Yufeng Yuan, Xiaochen Zuo, Yu Yue, Tiantian Fan, Gaohong Liu, Lingjun Liu, Xin Liu, Haibin Lin, Zhiqi Lin, Bole Ma, Guangming Sheng, Yuxuan Tong, Chi Zhang, Mofan Zhang, Wang Zhang, Hang Zhu, Jinhua Zhu, Jiaze Chen, Jiangjie Chen, Chengyi Wang, Hongli Yu, Weinan Dai, et al. (10 additional authors not shown)
Abstract: Inference scaling empowers LLMs with unprecedented reasoning ability, with reinforcement learning as the core technique to elicit complex reasoning. However, key technical details of state-of-the-art reasoning LLMs are concealed (such as in the OpenAI o1 blog and the DeepSeek R1 technical report), thus the community still struggles to reproduce their RL training results. We propose the Decoupled Clip and Dynamic sAmpling Policy Optimization (DAPO) algorithm, and fully open-source a state-of-the-art large-scale RL system that achieves 50 points on AIME 2024 using the Qwen2.5-32B base model. Unlike previous works that withhold training details, we introduce four key techniques of our algorithm that make large-scale LLM RL a success. In addition, we open-source our training code, which is built on the verl framework, along with a carefully curated and processed dataset. These components of our open-source system enhance reproducibility and support future research in large-scale LLM RL.
Submitted 18 March, 2025; originally announced March 2025.
Comments: Project Page: https://dapo-sia.github.io/

arXiv:2503.13882 [cs.LG, cs.AI]
MoK-RAG: Mixture of Knowledge Paths Enhanced Retrieval-Augmented Generation for Embodied AI Environments
Authors: Zhengsheng Guo, Linwei Zheng, Xinyang Chen, Xuefeng Bai, Kehai Chen, Min Zhang
Abstract: While human cognition inherently retrieves information from diverse and specialized knowledge sources during decision-making processes, current Retrieval-Augmented Generation (RAG) systems typically operate through single-source knowledge retrieval, leading to a cognitive-algorithmic discrepancy. To bridge this gap, we introduce MoK-RAG, a novel multi-source RAG framework that implements a mixture of knowledge paths enhanced retrieval mechanism through functional partitioning of a large language model (LLM) corpus into distinct sections, enabling retrieval from multiple specialized knowledge paths. Applied to the generation of 3D simulated environments, our proposed MoK-RAG3D enhances this paradigm by partitioning 3D assets into distinct sections and organizing them based on a hierarchical knowledge tree structure. Different from previous methods that only use manual evaluation, we pioneer the introduction of automated evaluation methods for 3D scenes. Both automatic and human evaluations in our experiments demonstrate that MoK-RAG3D can assist Embodied AI agents in generating diverse scenes.
Submitted 18 March, 2025; originally announced March 2025.
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.13882v1-abstract-full').style.display = 'none'; document.getElementById('2503.13882v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.13822">arXiv:2503.13822</a> <span> [<a href="https://arxiv.org/pdf/2503.13822">pdf</a>, <a href="https://arxiv.org/format/2503.13822">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Databases">cs.DB</span> </div> </div> <p class="title is-5 mathjax"> NeurBench: Benchmarking Learned Database Components with Data and Workload Drift Modeling </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhao%2C+Z">Zhanhao Zhao</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+H">Haotian Gao</a>, <a href="/search/cs?searchtype=author&query=Xing%2C+N">Naili Xing</a>, <a href="/search/cs?searchtype=author&query=Zeng%2C+L">Lingze Zeng</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+M">Meihui Zhang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+G">Gang Chen</a>, <a href="/search/cs?searchtype=author&query=Rigger%2C+M">Manuel Rigger</a>, <a href="/search/cs?searchtype=author&query=Ooi%2C+B+C">Beng Chin Ooi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.13822v2-abstract-short" style="display: inline;"> Learned database components, which deeply integrate machine learning into their design, have been extensively studied in recent years. Given the dynamism of databases, where data and workloads continuously drift, it is crucial for learned database components to remain effective and efficient in the face of data and workload drift. Adaptability, therefore, is a key factor in assessing their practic… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.13822v2-abstract-full').style.display = 'inline'; document.getElementById('2503.13822v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.13822v2-abstract-full" style="display: none;"> Learned database components, which deeply integrate machine learning into their design, have been extensively studied in recent years. Given the dynamism of databases, where data and workloads continuously drift, it is crucial for learned database components to remain effective and efficient in the face of data and workload drift. Adaptability, therefore, is a key factor in assessing their practical applicability. However, existing benchmarks for learned database components either overlook or oversimplify the treatment of data and workload drift, failing to evaluate learned database components across a broad range of drift scenarios. This paper presents NeurBench, a new benchmark suite that applies measurable and controllable data and workload drift to enable systematic performance evaluations of learned database components. 
We quantify diverse types of drift by introducing a key concept called the drift factor. Building on this formulation, we propose a drift-aware data and workload generation framework that effectively simulates real-world drift while preserving inherent correlations. We employ NeurBench to evaluate state-of-the-art learned query optimizers, learned indexes, and learned concurrency control within a consistent experimental process, providing insights into their performance under diverse data and workload drift scenarios. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.13822v2-abstract-full').style.display = 'none'; document.getElementById('2503.13822v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 17 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.13269">arXiv:2503.13269</a> <span> [<a href="https://arxiv.org/pdf/2503.13269">pdf</a>, <a href="https://arxiv.org/format/2503.13269">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Databases">cs.DB</span> </div> </div> <p class="title is-5 mathjax"> DAgent: A Relational Database-Driven Data Analysis Report Generation Agent </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xu%2C+W">Wenyi Xu</a>, <a href="/search/cs?searchtype=author&query=Mao%2C+Y">Yuren Mao</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xiaolu Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+C">Chao Zhang</a>, <a href="/search/cs?searchtype=author&query=Dong%2C+X">Xuemei Dong</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+M">Mengfei Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+J">Jun Zhou</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+Y">Yunjun Gao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.13269v1-abstract-short" style="display: inline;"> Relational database-driven data analysis (RDB-DA) report generation, which aims to generate data analysis reports after querying relational databases, has been widely applied in fields such as finance and healthcare. Typically, these tasks are manually completed by data scientists, making the process very labor-intensive and showing a clear need for automation. Although existing methods (e.g., Tab… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.13269v1-abstract-full').style.display = 'inline'; document.getElementById('2503.13269v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.13269v1-abstract-full" style="display: none;"> Relational database-driven data analysis (RDB-DA) report generation, which aims to generate data analysis reports after querying relational databases, has been widely applied in fields such as finance and healthcare. 
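The abstract does not define the drift factor; one natural toy reading, used here purely for illustration, is a knob that interpolates a workload's key distribution between a source and a target. Everything below is an assumption, not NeurBench's formulation:

```python
import numpy as np

def drifted_distribution(p_src, p_tgt, drift_factor):
    """drift_factor = 0 -> no drift; 1 -> fully shifted to the target."""
    return (1.0 - drift_factor) * p_src + drift_factor * p_tgt

p_src = np.array([0.7, 0.2, 0.1])   # e.g., access skew before drift
p_tgt = np.array([0.1, 0.2, 0.7])   # access skew after drift
for f in (0.0, 0.5, 1.0):
    print(f, drifted_distribution(p_src, p_tgt, f))
```

The appeal of a single scalar knob is that benchmark runs become comparable: "performance at drift 0.5" means the same generated workload shift for every component under test.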
arXiv:2503.13269 [cs.DB]
DAgent: A Relational Database-Driven Data Analysis Report Generation Agent
Authors: Wenyi Xu, Yuren Mao, Xiaolu Zhang, Chao Zhang, Xuemei Dong, Mengfei Zhang, Jun Zhou, Yunjun Gao
Abstract: Relational database-driven data analysis (RDB-DA) report generation, which aims to generate data analysis reports after querying relational databases, has been widely applied in fields such as finance and healthcare. Typically, these tasks are manually completed by data scientists, making the process very labor-intensive and showing a clear need for automation. Although existing methods (e.g., Table QA or Text-to-SQL) have been proposed to reduce human dependency, they cannot handle complex analytical tasks that require multi-step reasoning, cross-table associations, and synthesizing insights into reports. Moreover, there is no dataset available for developing automatic RDB-DA report generation. To fill this gap, this paper proposes an LLM agent system for RDB-DA report generation tasks, dubbed DAgent; moreover, we construct a benchmark for automatic data analysis report generation, which includes a new dataset DA-Dataset and evaluation metrics. DAgent integrates planning, tools, and memory modules to decompose natural language questions into logically independent sub-queries, accurately retrieve key information from relational databases, and generate analytical reports that meet the requirements of completeness, correctness, and conciseness through multi-step reasoning and effective data integration. Experimental analysis on the DA-Dataset demonstrates DAgent's superiority in retrieval performance and analysis report generation quality, showcasing its strong potential for tackling complex database analysis report generation tasks.
Submitted 17 March, 2025; originally announced March 2025.

arXiv:2503.12340 [cs.CL]
SVD-LLM V2: Optimizing Singular Value Truncation for Large Language Model Compression
Authors: Xin Wang, Samiul Alam, Zhongwei Wan, Hui Shen, Mi Zhang
Abstract: Despite significant advancements, the practical deployment of Large Language Models (LLMs) is often hampered by their immense sizes, highlighting the need for effective compression techniques. Singular Value Decomposition (SVD) is a promising LLM compression technique. However, existing SVD-based compression methods fall short in reducing truncation losses, leading to less competitive performance in compressed models. In this work, we introduce SVD-LLM V2, an SVD-based LLM compression method that optimizes singular value truncation in SVD compression with two techniques. First, SVD-LLM V2 proposes to use the theoretical truncation loss of weight matrices to assign a unique compression ratio to each weight matrix at different layers to accommodate weight redundancy heterogeneity. Second, SVD-LLM V2 proposes loss-optimized weight truncation to ensure that the truncated singular values result in a lower and more stable truncation loss in practice. We evaluate SVD-LLM V2 on ten datasets and five LLMs at various scales. Our results show SVD-LLM V2 outperforms state-of-the-art SVD-based LLM compression methods. Our code is available at https://github.com/AIoT-MLSys-Lab/SVD-LLM
Submitted 15 March, 2025; originally announced March 2025.
Comments: NAACL 2025; Code available at https://github.com/AIoT-MLSys-Lab/SVD-LLM
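The baseline that SVD-LLM V2 improves on is plain truncated-SVD weight compression: keep the top-r singular directions of each weight matrix, storing two thin factors instead of the full matrix. A generic sketch of that baseline (not the paper's loss-optimized truncation); the ratio-to-rank rule is an assumption:

```python
import numpy as np

def svd_compress(W, keep_ratio):
    """Factor W ~= A @ B keeping roughly keep_ratio of the parameters."""
    m, n = W.shape
    r = max(1, int(keep_ratio * m * n / (m + n)))   # r*(m+n) params vs m*n
    U, S, Vt = np.linalg.svd(W, full_matrices=False)
    A = U[:, :r] * S[:r]                            # (m, r), singular values folded in
    B = Vt[:r]                                      # (r, n)
    return A, B

W = np.random.default_rng(3).normal(size=(64, 64))
A, B = svd_compress(W, keep_ratio=0.5)
err = np.linalg.norm(W - A @ B) / np.linalg.norm(W)
print(A.shape, B.shape, round(float(err), 3))       # (64, 16) (16, 64) + error
```

The paper's two techniques then decide how large r should be per matrix (from theoretical truncation loss) and how to truncate so the realized loss stays low and stable.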
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">NAACL 2025; Code available at https://github.com/AIoT-MLSys-Lab/SVD-LLM</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.12152">arXiv:2503.12152</a> <span> [<a href="https://arxiv.org/pdf/2503.12152">pdf</a>, <a href="https://arxiv.org/format/2503.12152">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Improving LLM-based Document-level Machine Translation with Multi-Knowledge Fusion </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+B">Bin Liu</a>, <a href="/search/cs?searchtype=author&query=Lyu%2C+X">Xinglin Lyu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+J">Junhui Li</a>, <a href="/search/cs?searchtype=author&query=Wei%2C+D">Daimeng Wei</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+M">Min Zhang</a>, <a href="/search/cs?searchtype=author&query=Tao%2C+S">Shimin Tao</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+H">Hao Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.12152v1-abstract-short" style="display: inline;"> Recent studies in prompting large language model (LLM) for document-level machine translation (DMT) primarily focus on the inter-sentence context by flatting the source document into a long sequence. This approach relies solely on the sequence of sentences within the document. However, the complexity of document-level sequences is greater than that of shorter sentence-level sequences, which may li… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.12152v1-abstract-full').style.display = 'inline'; document.getElementById('2503.12152v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.12152v1-abstract-full" style="display: none;"> Recent studies in prompting large language model (LLM) for document-level machine translation (DMT) primarily focus on the inter-sentence context by flatting the source document into a long sequence. This approach relies solely on the sequence of sentences within the document. However, the complexity of document-level sequences is greater than that of shorter sentence-level sequences, which may limit LLM's ability in DMT when only this single-source knowledge is used. In this paper, we propose an enhanced approach by incorporating multiple sources of knowledge, including both the document summarization and entity translation, to enhance the performance of LLM-based DMT. Given a source document, we first obtain its summarization and translation of entities via LLM as the additional knowledge. We then utilize LLMs to generate two translations of the source document by fusing these two single knowledge sources, respectively. Finally, recognizing that different sources of knowledge may aid or hinder the translation of different sentences, we refine and rank the translations by leveraging a multi-knowledge fusion strategy to ensure the best results. 
Experimental results on eight document-level translation tasks show that our approach achieves average improvements of 0.8, 0.6, and 0.4 COMET points over the baseline without extra knowledge for LLaMA3-8B-Instruct, Mistral-Nemo-Instruct, and GPT-4o-mini, respectively. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.12152v1-abstract-full').style.display = 'none'; document.getElementById('2503.12152v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.12014">arXiv:2503.12014</a> <span> [<a href="https://arxiv.org/pdf/2503.12014">pdf</a>, <a href="https://arxiv.org/format/2503.12014">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Learning Dual-Domain Multi-Scale Representations for Single Image Deraining </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zou%2C+S">Shun Zou</a>, <a href="/search/cs?searchtype=author&query=Zou%2C+Y">Yi Zou</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+M">Mingya Zhang</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+S">Shipeng Luo</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+G">Guangwei Gao</a>, <a href="/search/cs?searchtype=author&query=Qi%2C+G">Guojun Qi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.12014v1-abstract-short" style="display: inline;"> Existing image deraining methods typically rely on single-input, single-output, and single-scale architectures, which overlook the joint multi-scale information between external and internal features. Furthermore, single-domain representations are often too restrictive, limiting their ability to handle the complexities of real-world rain scenarios. To address these challenges, we propose a novel D… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.12014v1-abstract-full').style.display = 'inline'; document.getElementById('2503.12014v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.12014v1-abstract-full" style="display: none;"> Existing image deraining methods typically rely on single-input, single-output, and single-scale architectures, which overlook the joint multi-scale information between external and internal features. Furthermore, single-domain representations are often too restrictive, limiting their ability to handle the complexities of real-world rain scenarios. To address these challenges, we propose a novel Dual-Domain Multi-Scale Representation Network (DMSR). The key idea is to exploit joint multi-scale representations from both external and internal domains in parallel while leveraging the strengths of both spatial and frequency domains to capture more comprehensive properties.
Specifically, our method consists of two main components: the Multi-Scale Progressive Spatial Refinement Module (MPSRM) and the Frequency Domain Scale Mixer (FDSM). The MPSRM enables the interaction and coupling of multi-scale expert information within the internal domain using a hierarchical modulation and fusion strategy. The FDSM extracts multi-scale local information in the spatial domain, while also modeling global dependencies in the frequency domain. Extensive experiments show that our model achieves state-of-the-art performance across six benchmark datasets. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.12014v1-abstract-full').style.display = 'none'; document.getElementById('2503.12014v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">6 pages, 5 figures, code: https://zs1314.github.io/DMSR</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.11995">arXiv:2503.11995</a> <span> [<a href="https://arxiv.org/pdf/2503.11995">pdf</a>, <a href="https://arxiv.org/format/2503.11995">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Fraesormer: Learning Adaptive Sparse Transformer for Efficient Food Recognition </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zou%2C+S">Shun Zou</a>, <a href="/search/cs?searchtype=author&query=Zou%2C+Y">Yi Zou</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+M">Mingya Zhang</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+S">Shipeng Luo</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Z">Zhihao Chen</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+G">Guangwei Gao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.11995v1-abstract-short" style="display: inline;"> In recent years, Transformers have driven significant progress in food recognition. However, most existing approaches still face two critical challenges in lightweight food recognition: (1) the quadratic complexity and redundant feature representation from interactions with irrelevant tokens; (2) static feature recognition and single-scale representation, which overlook the unstructured, non-fixe… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.11995v1-abstract-full').style.display = 'inline'; document.getElementById('2503.11995v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.11995v1-abstract-full" style="display: none;"> In recent years, Transformers have driven significant progress in food recognition.
However, most existing approaches still face two critical challenges in lightweight food recognition: (1) the quadratic complexity and redundant feature representation from interactions with irrelevant tokens; (2) static feature recognition and single-scale representation, which overlook the unstructured, non-fixed nature of food images and the need for multi-scale features. To address these challenges, we propose an adaptive and efficient sparse Transformer architecture (Fraesormer) with two core designs: Adaptive Top-k Sparse Partial Attention (ATK-SPA) and Hierarchical Scale-Sensitive Feature Gating Network (HSSFGN). ATK-SPA uses a learnable Gated Dynamic Top-K Operator (GDTKO) to retain critical attention scores, filtering low query-key matches that hinder feature aggregation. It also introduces a partial channel mechanism to reduce redundancy and promote expert information flow, enabling local-global collaborative modeling. HSSFGN employs a gating mechanism to achieve multi-scale feature representation, enhancing contextual semantic information. Extensive experiments show that Fraesormer outperforms state-of-the-art methods. Code is available at https://zs1314.github.io/Fraesormer. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.11995v1-abstract-full').style.display = 'none'; document.getElementById('2503.11995v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">6 pages, 4 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.11301">arXiv:2503.11301</a> <span> [<a href="https://arxiv.org/pdf/2503.11301">pdf</a>, <a href="https://arxiv.org/format/2503.11301">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multiagent Systems">cs.MA</span> </div> </div> <p class="title is-5 mathjax"> GNNs as Predictors of Agentic Workflow Performances </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yuanshuo Zhang</a>, <a href="/search/cs?searchtype=author&query=Hou%2C+Y">Yuchen Hou</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+B">Bohan Tang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+S">Shuo Chen</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+M">Muhan Zhang</a>, <a href="/search/cs?searchtype=author&query=Dong%2C+X">Xiaowen Dong</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+S">Siheng Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.11301v1-abstract-short" style="display: inline;"> Agentic workflows invoked by Large Language Models (LLMs) have achieved remarkable success in handling complex tasks. However, optimizing such workflows is costly and inefficient in real-world applications due to extensive invocations of LLMs.
To fill this gap, this position paper formulates agentic workflows as computational graphs and advocates Graph Neural Networks (GNNs) as efficient predictor… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.11301v1-abstract-full').style.display = 'inline'; document.getElementById('2503.11301v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.11301v1-abstract-full" style="display: none;"> Agentic workflows invoked by Large Language Models (LLMs) have achieved remarkable success in handling complex tasks. However, optimizing such workflows is costly and inefficient in real-world applications due to extensive invocations of LLMs. To fill this gap, this position paper formulates agentic workflows as computational graphs and advocates Graph Neural Networks (GNNs) as efficient predictors of agentic workflow performances, avoiding repeated LLM invocations for evaluation. To empirically ground this position, we construct FLORA-Bench, a unified platform for benchmarking GNNs for predicting agentic workflow performances. With extensive experiments, we arrive at the following conclusion: GNNs are simple yet effective predictors. This conclusion supports new applications of GNNs and a novel direction towards automating agentic workflow optimization. All codes, models, and data are available at https://github.com/youngsoul0731/Flora-Bench. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.11301v1-abstract-full').style.display = 'none'; document.getElementById('2503.11301v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">15 pages, 11 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.11082">arXiv:2503.11082</a> <span> [<a href="https://arxiv.org/pdf/2503.11082">pdf</a>, <a href="https://arxiv.org/format/2503.11082">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Software Engineering">cs.SE</span> </div> </div> <p class="title is-5 mathjax"> LLMs are Bug Replicators: An Empirical Study on LLMs' Capability in Completing Bug-prone Code </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Guo%2C+L">Liwei Guo</a>, <a href="/search/cs?searchtype=author&query=Ye%2C+S">Sixiang Ye</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+Z">Zeyu Sun</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+X">Xiang Chen</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yuxia Zhang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+B">Bo Wang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J+M">Jie M. 
Zhang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zheng Li</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yong Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.11082v1-abstract-short" style="display: inline;"> Large Language Models (LLMs) have demonstrated remarkable performance in code completion. However, the training data used to develop these models often contain a significant amount of buggy code. Yet, it remains unclear to what extent these buggy instances influence LLMs' performance when tackling bug-prone code completion tasks. To fill this gap, this paper presents the first empirical study eval… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.11082v1-abstract-full').style.display = 'inline'; document.getElementById('2503.11082v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.11082v1-abstract-full" style="display: none;"> Large Language Models (LLMs) have demonstrated remarkable performance in code completion. However, the training data used to develop these models often contain a significant amount of buggy code. Yet, it remains unclear to what extent these buggy instances influence LLMs' performance when tackling bug-prone code completion tasks. To fill this gap, this paper presents the first empirical study evaluating the performance of LLMs in completing bug-prone code. Through extensive experiments on 7 LLMs and the Defects4J dataset, we analyze LLMs' accuracy, robustness, and limitations in this challenging context. Our experimental results show that completing bug-prone code is significantly more challenging for LLMs than completing normal code. Notably, in bug-prone tasks, the likelihood of LLMs generating correct code is nearly the same as generating buggy code, and it is substantially lower than in normal code completion tasks (e.g., 12.27% vs. 29.85% for GPT-4). To our surprise, 44.44% of the bugs LLMs make are completely identical to the pre-fix version, indicating that LLMs have been seriously biased by historical bugs when completing code. Additionally, we investigate the effectiveness of existing post-processing techniques and find that while they can improve consistency, they do not significantly reduce error rates in bug-prone code scenarios. Our research highlights the limitations of current LLMs in handling bug-prone code and underscores the need for improved models and post-processing strategies to enhance code completion accuracy in real-world development environments. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.11082v1-abstract-full').style.display = 'none'; document.getElementById('2503.11082v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. 
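</p> <p class="is-size-7">The study's headline measurement is easy to picture: for each bug-prone location, compare the model's completion against both the post-fix (correct) line and the historical pre-fix (buggy) line. Below is a hypothetical toy harness in that spirit; the function names and the Java-style snippet are illustrative, not the paper's code.</p> <pre><code class="language-python">
def normalize(line: str) -> str:
    # Compare lines loosely by collapsing whitespace differences.
    return " ".join(line.split())

def classify_completion(completion: str, fixed_line: str, buggy_line: str) -> str:
    # "replicated-bug" mirrors the paper's finding that many LLM errors
    # are identical to the pre-fix version of the code.
    c = normalize(completion)
    if c == normalize(fixed_line):
        return "correct"
    if c == normalize(buggy_line):
        return "replicated-bug"
    return "other-error"

print(classify_completion(
    completion="if (count > capacity)",    # hypothetical model output
    fixed_line="if (count >= capacity)",   # post-fix (correct) line
    buggy_line="if (count > capacity)",    # pre-fix (buggy) line
))  # prints: replicated-bug
</code></pre> <p class="is-size-7">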
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.10793">arXiv:2503.10793</a> <span> [<a href="https://arxiv.org/pdf/2503.10793">pdf</a>, <a href="https://arxiv.org/format/2503.10793">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Software Engineering">cs.SE</span> </div> </div> <p class="title is-5 mathjax"> HALURust: Exploiting Hallucinations of Large Language Models to Detect Vulnerabilities in Rust </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Luo%2C+Y">Yu Luo</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+H">Han Zhou</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+M">Mengtao Zhang</a>, <a href="/search/cs?searchtype=author&query=De+La+Rosa%2C+D">Dylan De La Rosa</a>, <a href="/search/cs?searchtype=author&query=Ahmed%2C+H">Hafsa Ahmed</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+W">Weifeng Xu</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+D">Dianxiang Xu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.10793v1-abstract-short" style="display: inline;"> As an emerging programming language, Rust has rapidly gained popularity and recognition among developers due to its strong emphasis on safety. It employs a unique ownership system and safe concurrency practices to ensure robust safety. Despite these safeguards, security in Rust still presents challenges. Since 2018, 442 Rust-related vulnerabilities have been reported in real-world applications. Th… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.10793v1-abstract-full').style.display = 'inline'; document.getElementById('2503.10793v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.10793v1-abstract-full" style="display: none;"> As an emerging programming language, Rust has rapidly gained popularity and recognition among developers due to its strong emphasis on safety. It employs a unique ownership system and safe concurrency practices to ensure robust safety. Despite these safeguards, security in Rust still presents challenges. Since 2018, 442 Rust-related vulnerabilities have been reported in real-world applications. The limited availability of data has resulted in existing vulnerability detection tools performing poorly in real-world scenarios, often failing to adapt to new and complex vulnerabilities. This paper introduces HALURust, a novel framework that leverages hallucinations of large language models (LLMs) to detect vulnerabilities in real-world Rust scenarios. HALURust leverages LLMs' strength in natural language generation by transforming code into detailed vulnerability analysis reports. The key innovation lies in prompting the LLM to always assume the presence of a vulnerability. If the code sample is vulnerable, the LLM provides an accurate analysis; if not, it generates a hallucinated report. By fine-tuning LLMs on these hallucinations, HALURust can effectively distinguish between vulnerable and non-vulnerable code samples. 
HALURust was evaluated on a dataset of 81 real-world vulnerabilities, covering 447 functions and 18,691 lines of code across 54 applications. It outperformed existing methods, achieving an F1 score of 77.3%, an improvement of over 10%. The hallucinated report-based fine-tuning improved detection by 20% compared to traditional code-based fine-tuning. Additionally, HALURust effectively adapted to unseen vulnerabilities and other programming languages, demonstrating strong generalization capabilities. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.10793v1-abstract-full').style.display = 'none'; document.getElementById('2503.10793v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.10546">arXiv:2503.10546</a> <span> [<a href="https://arxiv.org/pdf/2503.10546">pdf</a>, <a href="https://arxiv.org/format/2503.10546">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> KUDA: Keypoints to Unify Dynamics Learning and Visual Prompting for Open-Vocabulary Robotic Manipulation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+Z">Zixian Liu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+M">Mingtong Zhang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yunzhu Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.10546v1-abstract-short" style="display: inline;"> With the rapid advancement of large language models (LLMs) and vision-language models (VLMs), significant progress has been made in developing open-vocabulary robotic manipulation systems. However, many existing approaches overlook the importance of object dynamics, limiting their applicability to more complex, dynamic tasks. In this work, we introduce KUDA, an open-vocabulary manipulation system… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.10546v1-abstract-full').style.display = 'inline'; document.getElementById('2503.10546v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.10546v1-abstract-full" style="display: none;"> With the rapid advancement of large language models (LLMs) and vision-language models (VLMs), significant progress has been made in developing open-vocabulary robotic manipulation systems. However, many existing approaches overlook the importance of object dynamics, limiting their applicability to more complex, dynamic tasks. In this work, we introduce KUDA, an open-vocabulary manipulation system that integrates dynamics learning and visual prompting through keypoints, leveraging both VLMs and learning-based neural dynamics models.
Our key insight is that a keypoint-based target specification is simultaneously interpretable by VLMs and can be efficiently translated into cost functions for model-based planning. Given language instructions and visual observations, KUDA first assigns keypoints to the RGB image and queries the VLM to generate target specifications. These abstract keypoint-based representations are then converted into cost functions, which are optimized using a learned dynamics model to produce robotic trajectories. We evaluate KUDA on a range of manipulation tasks, including free-form language instructions across diverse object categories, multi-object interactions, and deformable or granular objects, demonstrating the effectiveness of our framework. The project page is available at http://kuda-dynamics.github.io. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.10546v1-abstract-full').style.display = 'none'; document.getElementById('2503.10546v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project website: http://kuda-dynamics.github.io</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.10211">arXiv:2503.10211</a> <span> [<a href="https://arxiv.org/pdf/2503.10211">pdf</a>, <a href="https://arxiv.org/format/2503.10211">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Adaptive Inner Speech-Text Alignment for LLM-based Speech Translation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+H">Henglyu Liu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+A">Andong Chen</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+K">Kehai Chen</a>, <a href="/search/cs?searchtype=author&query=Bai%2C+X">Xuefeng Bai</a>, <a href="/search/cs?searchtype=author&query=Zhong%2C+M">Meizhi Zhong</a>, <a href="/search/cs?searchtype=author&query=Qiu%2C+Y">Yuan Qiu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+M">Min Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.10211v1-abstract-short" style="display: inline;"> Recent advancement of large language models (LLMs) has led to significant breakthroughs across various tasks, laying the foundation for the development of LLM-based speech translation systems. Existing methods primarily focus on aligning inputs and outputs across modalities while overlooking deeper semantic alignment within model representations. 
To address this limitation, we propose an Adaptive… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.10211v1-abstract-full').style.display = 'inline'; document.getElementById('2503.10211v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.10211v1-abstract-full" style="display: none;"> Recent advancement of large language models (LLMs) has led to significant breakthroughs across various tasks, laying the foundation for the development of LLM-based speech translation systems. Existing methods primarily focus on aligning inputs and outputs across modalities while overlooking deeper semantic alignment within model representations. To address this limitation, we propose an Adaptive Inner Speech-Text Alignment (AI-STA) method to bridge the modality gap by explicitly aligning speech and text representations at selected layers within LLMs. To achieve this, we leverage optimal transport (OT) theory to quantify fine-grained representation discrepancies between speech and text. Furthermore, we utilize a cross-modal retrieval technique to identify the layers that are best suited for alignment and perform joint training on these layers. Experimental results on speech translation (ST) tasks demonstrate that AI-STA significantly improves the translation performance of large speech-text models (LSMs), outperforming previous state-of-the-art approaches. Our findings highlight the importance of inner-layer speech-text alignment in LLMs and provide new insights into enhancing cross-modal learning. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.10211v1-abstract-full').style.display = 'none'; document.getElementById('2503.10211v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025.
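</p> <p class="is-size-7">The optimal-transport ingredient can be illustrated compactly. The sketch below is a generic entropy-regularized Sinkhorn routine, not the paper's AI-STA code; the sizes, data, and regularization value are made up. It scores the discrepancy between a layer's speech and text representations as the transport cost between the two point clouds.</p> <pre><code class="language-python">
import numpy as np

def sinkhorn_discrepancy(X, Y, reg=0.5, n_iters=200):
    # Entropy-regularized optimal transport between two representation sets
    # (rows are frames/tokens); returns the transport cost as a discrepancy score.
    cost = np.linalg.norm(X[:, None, :] - Y[None, :, :], axis=-1)  # pairwise L2 costs
    K = np.exp(-cost / reg)
    a = np.full(X.shape[0], 1.0 / X.shape[0])  # uniform source marginal
    b = np.full(Y.shape[0], 1.0 / Y.shape[0])  # uniform target marginal
    u = np.ones_like(a)
    for _ in range(n_iters):                   # Sinkhorn fixed-point updates
        v = b / (K.T @ u)
        u = a / (K @ v)
    P = u[:, None] * K * v[None, :]            # transport plan
    return float(np.sum(P * cost))

rng = np.random.default_rng(0)
speech = rng.normal(size=(40, 16))                    # 40 speech frames, 16-dim (toy)
text = speech[::2] + 0.1 * rng.normal(size=(20, 16))  # 20 roughly aligned text tokens
print(f"speech-text OT discrepancy: {sinkhorn_discrepancy(speech, text):.3f}")
</code></pre> <p class="is-size-7">Layers where this score is low (and where cross-modal retrieval succeeds) would be natural candidates for the joint alignment training the abstract describes.</p>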
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">12 pages, 7 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.10093">arXiv:2503.10093</a> <span> [<a href="https://arxiv.org/pdf/2503.10093">pdf</a>, <a href="https://arxiv.org/format/2503.10093">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Representation-based Reward Modeling for Efficient Safety Alignment of Large Language Model </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Deng%2C+Q">Qiyuan Deng</a>, <a href="/search/cs?searchtype=author&query=Bai%2C+X">Xuefeng Bai</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+K">Kehai Chen</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yaowei Wang</a>, <a href="/search/cs?searchtype=author&query=Nie%2C+L">Liqiang Nie</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+M">Min Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.10093v1-abstract-short" style="display: inline;"> Reinforcement Learning (RL) algorithms for safety alignment of Large Language Models (LLMs), such as Direct Preference Optimization (DPO), encounter the challenge of distribution shift. Current approaches typically address this issue through online sampling from the target policy, which requires significant computational resources. In this paper, we hypothesize that during off-policy training, whi… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.10093v1-abstract-full').style.display = 'inline'; document.getElementById('2503.10093v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.10093v1-abstract-full" style="display: none;"> Reinforcement Learning (RL) algorithms for safety alignment of Large Language Models (LLMs), such as Direct Preference Optimization (DPO), encounter the challenge of distribution shift. Current approaches typically address this issue through online sampling from the target policy, which requires significant computational resources. In this paper, we hypothesize that during off-policy training, while the ranking order of output generated by policy changes, their overall distribution remains relatively stable. This stability allows the transformation of the sampling process from the target policy into a re-ranking of preference data. Building on this hypothesis, We propose a new framework that leverages the model's intrinsic safety judgment capability to extract reward signals, which are then used to calculate label confidence for preferences reordering. Extensive experimental results and theoretical analysis demonstrate that the proposed method effectively addresses the distribution shift issue, remarkably enhancing the safety performance while reducing about 300x computational overheads. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.10093v1-abstract-full').style.display = 'none'; document.getElementById('2503.10093v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.09958">arXiv:2503.09958</a> <span> [<a href="https://arxiv.org/pdf/2503.09958">pdf</a>, <a href="https://arxiv.org/format/2503.09958">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Take Off the Training Wheels Progressive In-Context Learning for Effective Alignment </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+Z">Zhenyu Liu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+D">Dongfang Li</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+X">Xinshuo Hu</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+X">Xinping Zhao</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yibin Chen</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+B">Baotian Hu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+M">Min Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.09958v1-abstract-short" style="display: inline;"> Recent studies have explored the working mechanisms of In-Context Learning (ICL). However, they mainly focus on classification and simple generation tasks, limiting their broader application to more complex generation tasks in practice. To address this gap, we investigate the impact of demonstrations on token representations within the practical alignment tasks. We find that the transformer embeds… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.09958v1-abstract-full').style.display = 'inline'; document.getElementById('2503.09958v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.09958v1-abstract-full" style="display: none;"> Recent studies have explored the working mechanisms of In-Context Learning (ICL). However, they mainly focus on classification and simple generation tasks, limiting their broader application to more complex generation tasks in practice. To address this gap, we investigate the impact of demonstrations on token representations within the practical alignment tasks. We find that the transformer embeds the task function learned from demonstrations into the separator token representation, which plays an important role in the generation of prior response tokens. Once the prior response tokens are determined, the demonstrations become redundant.Motivated by this finding, we propose an efficient Progressive In-Context Alignment (PICA) method consisting of two stages. In the first few-shot stage, the model generates several prior response tokens via standard ICL while concurrently extracting the ICL vector that stores the task function from the separator token representation. 
In the following zero-shot stage, this ICL vector guides the model to generate responses without further demonstrations. Extensive experiments demonstrate that our PICA not only surpasses vanilla ICL but also achieves comparable performance to other alignment tuning methods. The proposed training-free method reduces the time cost (e.g., 5.45+) with improved alignment performance (e.g., 6.57+). Consequently, our work highlights the application of ICL for alignment and calls for a deeper understanding of ICL for complex generation tasks. The code will be available at https://github.com/HITsz-TMG/PICA. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.09958v1-abstract-full').style.display = 'none'; document.getElementById('2503.09958v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">15 pages, 9 figures, published in EMNLP2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.09712">arXiv:2503.09712</a> <span> [<a href="https://arxiv.org/pdf/2503.09712">pdf</a>, <a href="https://arxiv.org/format/2503.09712">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1145/3696410.3714827">10.1145/3696410.3714827 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Revisiting Backdoor Attacks on Time Series Classification in the Frequency Domain </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Huang%2C+Y">Yuanmin Huang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+M">Mi Zhang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zhaoxiang Wang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+W">Wenxuan Li</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+M">Min Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.09712v2-abstract-short" style="display: inline;"> Time series classification (TSC) is a cornerstone of modern web applications, powering tasks such as financial data analysis, network traffic monitoring, and user behavior analysis. In recent years, deep neural networks (DNNs) have greatly enhanced the performance of TSC models in these critical domains.
However, DNNs are vulnerable to backdoor attacks, where attackers can covertly implant trigger… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.09712v2-abstract-full').style.display = 'inline'; document.getElementById('2503.09712v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.09712v2-abstract-full" style="display: none;"> Time series classification (TSC) is a cornerstone of modern web applications, powering tasks such as financial data analysis, network traffic monitoring, and user behavior analysis. In recent years, deep neural networks (DNNs) have greatly enhanced the performance of TSC models in these critical domains. However, DNNs are vulnerable to backdoor attacks, where attackers can covertly implant triggers into models to induce malicious outcomes. Existing backdoor attacks targeting DNN-based TSC models remain elementary. In particular, early methods borrow trigger designs from computer vision, which are ineffective for time series data. More recent approaches utilize generative models for trigger generation, but at the cost of significant computational complexity. In this work, we analyze the limitations of existing attacks and introduce an enhanced method, FreqBack. Drawing inspiration from the fact that DNN models inherently capture frequency domain features in time series data, we identify that improper perturbations in the frequency domain are the root cause of ineffective attacks. To address this, we propose to generate triggers both effectively and efficiently, guided by frequency analysis. FreqBack delivers strong performance across five models and eight datasets, achieving an attack success rate of over 90%, while maintaining less than a 3% drop in model accuracy on clean data. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.09712v2-abstract-full').style.display = 'none'; document.getElementById('2503.09712v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 12 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025.
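</p> <p class="is-size-7">The root-cause argument is that effective time-series triggers live in the frequency domain. As a generic illustration (our sketch, not FreqBack itself; the bin indices and amplitude are arbitrary assumptions), a trigger can be implanted by nudging a few FFT coefficients and transforming back, yielding a small, spectrally localized perturbation.</p> <pre><code class="language-python">
import numpy as np

def add_frequency_trigger(x, freq_bins=(5, 6), amplitude=0.05):
    # Perturb selected FFT coefficients of a univariate series, then invert.
    spec = np.fft.rfft(x)
    for k in freq_bins:
        spec[k] += amplitude * x.shape[0]  # localized bump in the spectrum
    return np.fft.irfft(spec, n=x.shape[0])

rng = np.random.default_rng(0)
clean = np.sin(np.linspace(0.0, 8.0 * np.pi, 256)) + 0.1 * rng.normal(size=256)
poisoned = add_frequency_trigger(clean)
print(f"max |poisoned - clean| = {np.max(np.abs(poisoned - clean)):.4f}")  # stays small vs. the unit-amplitude signal
</code></pre> <p class="is-size-7">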
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">WWW 2025 (Oral)</span> </p> </li> </ol> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Zhang%2C+M&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Zhang%2C+M&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Zhang%2C+M&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Zhang%2C+M&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Zhang%2C+M&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Zhang%2C+M&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">…</span></li> </ul> </nav> <div class="is-hidden-tablet"> <!-- feedback for mobile only --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a> </span> </div> </div> </main> <footer> <div class="columns is-desktop" role="navigation" aria-label="Secondary"> <!-- MetaColumn 1 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div> <!-- end MetaColumn 1 --> <!-- MetaColumn 2 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a 
href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div> <!-- end MetaColumn 2 --> </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>