Search | arXiv e-print repository
<a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1–50 of 3,667 results for author: <span class="mathjax">Zhang, W</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a> </span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&query=Zhang%2C+W">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." 
type="text" value="Zhang, W"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Zhang%2C+W&terms-0-field=author&size=50&order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Zhang, W"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. 
</div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Zhang%2C+W&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Zhang%2C+W&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Zhang%2C+W&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Zhang%2C+W&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Zhang%2C+W&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Zhang%2C+W&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">…</span></li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.21696">arXiv:2503.21696</a> <span> [<a href="https://arxiv.org/pdf/2503.21696">pdf</a>, <a href="https://arxiv.org/format/2503.21696">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Embodied-Reasoner: Synergizing Visual Search, Reasoning, and Action for Embodied Interactive Tasks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+W">Wenqi Zhang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+M">Mengna Wang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+G">Gangao Liu</a>, <a href="/search/cs?searchtype=author&query=Huixin%2C+X">Xu Huixin</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+Y">Yiwei Jiang</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+Y">Yongliang Shen</a>, <a href="/search/cs?searchtype=author&query=Hou%2C+G">Guiyang Hou</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+Z">Zhe Zheng</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+H">Hang Zhang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+X">Xin Li</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+W">Weiming Lu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+P">Peng Li</a>, <a href="/search/cs?searchtype=author&query=Zhuang%2C+Y">Yueting Zhuang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span 
class="abstract-short has-text-grey-dark mathjax" id="2503.21696v1-abstract-short" style="display: inline;"> Recent advances in deep thinking models have demonstrated remarkable reasoning capabilities on mathematical and coding tasks. However, their effectiveness in embodied domains which require continuous interaction with environments through image action interleaved trajectories remains largely -unexplored. We present Embodied Reasoner, a model that extends o1 style reasoning to interactive embodied s… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.21696v1-abstract-full').style.display = 'inline'; document.getElementById('2503.21696v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.21696v1-abstract-full" style="display: none;"> Recent advances in deep thinking models have demonstrated remarkable reasoning capabilities on mathematical and coding tasks. However, their effectiveness in embodied domains which require continuous interaction with environments through image action interleaved trajectories remains largely -unexplored. We present Embodied Reasoner, a model that extends o1 style reasoning to interactive embodied search tasks. Unlike mathematical reasoning that relies primarily on logical deduction, embodied scenarios demand spatial understanding, temporal reasoning, and ongoing self-reflection based on interaction history. To address these challenges, we synthesize 9.3k coherent Observation-Thought-Action trajectories containing 64k interactive images and 90k diverse thinking processes (analysis, spatial reasoning, reflection, planning, and verification). We develop a three-stage training pipeline that progressively enhances the model's capabilities through imitation learning, self-exploration via rejection sampling, and self-correction through reflection tuning. The evaluation shows that our model significantly outperforms those advanced visual reasoning models, e.g., it exceeds OpenAI o1, o3-mini, and Claude-3.7 by +9\%, 24\%, and +13\%. Analysis reveals our model exhibits fewer repeated searches and logical inconsistencies, with particular advantages in complex long-horizon tasks. Real-world environments also show our superiority while exhibiting fewer repeated searches and logical inconsistency cases. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.21696v1-abstract-full').style.display = 'none'; document.getElementById('2503.21696v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Code: https://github.com/zwq2018/embodied_reasoner Dataset: https://huggingface.co/datasets/zwq2018/embodied_reasoner</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.21460">arXiv:2503.21460</a> <span> [<a href="https://arxiv.org/pdf/2503.21460">pdf</a>, <a href="https://arxiv.org/format/2503.21460">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Large Language Model Agent: A Survey on Methodology, Applications and Challenges </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Luo%2C+J">Junyu Luo</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+W">Weizhi Zhang</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+Y">Ye Yuan</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+Y">Yusheng Zhao</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+J">Junwei Yang</a>, <a href="/search/cs?searchtype=author&query=Gu%2C+Y">Yiyang Gu</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+B">Bohan Wu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+B">Binqi Chen</a>, <a href="/search/cs?searchtype=author&query=Qiao%2C+Z">Ziyue Qiao</a>, <a href="/search/cs?searchtype=author&query=Long%2C+Q">Qingqing Long</a>, <a href="/search/cs?searchtype=author&query=Tu%2C+R">Rongcheng Tu</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+X">Xiao Luo</a>, <a href="/search/cs?searchtype=author&query=Ju%2C+W">Wei Ju</a>, <a href="/search/cs?searchtype=author&query=Xiao%2C+Z">Zhiping Xiao</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yifan Wang</a>, <a href="/search/cs?searchtype=author&query=Xiao%2C+M">Meng Xiao</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+C">Chenwu Liu</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+J">Jingyang Yuan</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+S">Shichang Zhang</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+Y">Yiqiao Jin</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+F">Fan Zhang</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+X">Xian Wu</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+H">Hanqing Zhao</a>, <a href="/search/cs?searchtype=author&query=Tao%2C+D">Dacheng Tao</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+P+S">Philip S. Yu</a> , et al. (1 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.21460v1-abstract-short" style="display: inline;"> The era of intelligent agents is upon us, driven by revolutionary advancements in large language models. Large Language Model (LLM) agents, with goal-driven behaviors and dynamic adaptation capabilities, potentially represent a critical pathway toward artificial general intelligence. 
This survey systematically deconstructs LLM agent systems through a methodology-centered taxonomy, linking architec… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.21460v1-abstract-full').style.display = 'inline'; document.getElementById('2503.21460v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.21460v1-abstract-full" style="display: none;"> The era of intelligent agents is upon us, driven by revolutionary advancements in large language models. Large Language Model (LLM) agents, with goal-driven behaviors and dynamic adaptation capabilities, potentially represent a critical pathway toward artificial general intelligence. This survey systematically deconstructs LLM agent systems through a methodology-centered taxonomy, linking architectural foundations, collaboration mechanisms, and evolutionary pathways. We unify fragmented research threads by revealing fundamental connections between agent design principles and their emergent behaviors in complex environments. Our work provides a unified architectural perspective, examining how agents are constructed, how they collaborate, and how they evolve over time, while also addressing evaluation methodologies, tool applications, practical challenges, and diverse application domains. By surveying the latest developments in this rapidly evolving field, we offer researchers a structured taxonomy for understanding LLM agents and identify promising directions for future research. The collection is available at https://github.com/luo-junyu/Awesome-Agent-Papers. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.21460v1-abstract-full').style.display = 'none'; document.getElementById('2503.21460v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">329 papers surveyed, resources are at https://github.com/luo-junyu/Awesome-Agent-Papers</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.21297">arXiv:2503.21297</a> <span> [<a href="https://arxiv.org/pdf/2503.21297">pdf</a>, <a href="https://arxiv.org/format/2503.21297">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Hardware Architecture">cs.AR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> </div> </div> <p class="title is-5 mathjax"> MLDSE: Scaling Design Space Exploration Infrastructure for Multi-Level Hardware </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Qu%2C+H">Huanyu Qu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+W">Weihao Zhang</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+J">Junfeng Lin</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+S">Songchen Ma</a>, <a href="/search/cs?searchtype=author&query=Li%2C+H">Hongyi Li</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+L">Luping Shi</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+C">Chengzhong Xu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.21297v1-abstract-short" style="display: inline;"> To efficiently support large-scale NNs, multi-level hardware, leveraging advanced integration and interconnection technologies, has emerged as a promising solution to counter the slowdown of Moore's law. However, the vast design space of such hardware, coupled with the complexity of their spatial hierarchies and organizations, introduces significant challenges for design space exploration (DSE). E… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.21297v1-abstract-full').style.display = 'inline'; document.getElementById('2503.21297v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.21297v1-abstract-full" style="display: none;"> To efficiently support large-scale NNs, multi-level hardware, leveraging advanced integration and interconnection technologies, has emerged as a promising solution to counter the slowdown of Moore's law. However, the vast design space of such hardware, coupled with the complexity of their spatial hierarchies and organizations, introduces significant challenges for design space exploration (DSE). Existing DSE tools, which rely on predefined hardware templates to explore parameters for specific architectures, fall short in exploring diverse organizations, spatial hierarchies, and architectural polymorphisms inherent in multi-level hardware. To address these limitations, we present Multi-Level Design Space Exploror (MLDSE), a novel infrastructure for domain-specific DSE of multi-level hardware. MLDSE introduces three key innovations from three basic perspectives of DSE: 1) Modeling: MLDSE introduces a hardware intermediate representation (IR) that can recursively model diverse multi-level hardware with composable elements at various granularities. 
2) Mapping: MLDSE provides a comprehensive spatiotemporal mapping IR and mapping primitives, facilitating the mapping strategy exploration on multi-level hardware, especially synchronization and cross-level communication; 3) Simulation: MLDSE supports universal simulator generation based on task-level event-driven simulation mechanism. It features a hardware-consistent scheduling algorithm that can handle general task-level resource contention. Through experiments on LLM workloads, we demonstrate MLDSE's unique capability to perform three-tier DSE spanning architecture, hardware parameter, and mapping. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.21297v1-abstract-full').style.display = 'none'; document.getElementById('2503.21297v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.21236">arXiv:2503.21236</a> <span> [<a href="https://arxiv.org/pdf/2503.21236">pdf</a>, <a href="https://arxiv.org/format/2503.21236">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> </div> </div> <p class="title is-5 mathjax"> Clean Image May be Dangerous: Data Poisoning Attacks Against Deep Hashing </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+S">Shuai Li</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jie Zhang</a>, <a href="/search/cs?searchtype=author&query=Qi%2C+Y">Yuang Qi</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+K">Kejiang Chen</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+T">Tianwei Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+W">Weiming Zhang</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+N">Nenghai Yu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.21236v1-abstract-short" style="display: inline;"> Large-scale image retrieval using deep hashing has become increasingly popular due to the exponential growth of image data and the remarkable feature extraction capabilities of deep neural networks (DNNs). However, deep hashing methods are vulnerable to malicious attacks, including adversarial and backdoor attacks. It is worth noting that these attacks typically involve altering the query images,… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.21236v1-abstract-full').style.display = 'inline'; document.getElementById('2503.21236v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.21236v1-abstract-full" style="display: none;"> Large-scale image retrieval using deep hashing has become increasingly popular due to the exponential growth of image data and the remarkable feature extraction capabilities of deep neural networks (DNNs). 
   However, deep hashing methods are vulnerable to malicious attacks, including adversarial and backdoor attacks. It is worth noting that these attacks typically involve altering the query images, which is not a practical concern in real-world scenarios. In this paper, we point out that even clean query images can be dangerous, inducing malicious target retrieval results, like undesired or illegal images. To the best of our knowledge, we are the first to study data poisoning attacks against deep hashing (PADHASH). Specifically, we first train a surrogate model to simulate the behavior of the target deep hashing model. Then, a strict gradient matching strategy is proposed to generate the poisoned images. Extensive experiments on different models, datasets, hash methods, and hash code lengths demonstrate the effectiveness and generality of our attack method.
   Submitted 27 March, 2025; originally announced March 2025.
   Comments: Accepted by TMM

5. arXiv:2503.21206 [pdf, other]  cs.DC (Distributed, Parallel, and Cluster Computing)
   PilotANN: Memory-Bounded GPU Acceleration for Vector Search
   Authors: Yuntao Gui, Peiqi Yin, Xiao Yan, Chaorui Zhang, Weixi Zhang, James Cheng
   Abstract: Approximate Nearest Neighbor Search (ANNS) has become fundamental to modern deep learning applications, having gained particular prominence through its integration into recent generative models that work with increasingly complex datasets and higher vector dimensions. Existing CPU-only solutions, even the most efficient graph-based ones, struggle to meet these growing computational demands, while GPU-only solutions face memory constraints. As a solution, we propose PilotANN, a hybrid CPU-GPU system for graph-based ANNS that utilizes both the CPU's abundant RAM and the GPU's parallel processing capabilities. Our approach decomposes the graph traversal process of top-k search into three stages: GPU-accelerated subgraph traversal using SVD-reduced vectors, CPU refinement, and precise search using complete vectors. Furthermore, we introduce fast entry selection to improve search starting points while maximizing GPU utilization. Experimental results demonstrate that PilotANN achieves a 3.9-5.4x speedup in throughput on 100-million scale datasets and is able to handle datasets up to 12x larger than the GPU memory. We offer a complete open-source implementation at https://github.com/ytgui/PilotANN.
   Submitted 27 March, 2025; originally announced March 2025.
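
   The staged search this abstract describes, an approximate pass over dimensionality-reduced vectors followed by exact re-ranking with the complete vectors, can be illustrated with a small NumPy sketch. This is not the paper's graph-based algorithm or its CPU/GPU split; it is a brute-force toy of the general reduce-then-refine pattern, and every array, size, and parameter below is invented for the example.

   ```python
   # Toy illustration (not the PilotANN implementation): candidate generation with
   # SVD-reduced vectors followed by exact re-ranking with the full vectors.
   import numpy as np

   rng = np.random.default_rng(0)
   base = rng.standard_normal((10_000, 128)).astype(np.float32)   # full-precision database
   query = rng.standard_normal(128).astype(np.float32)

   # Offline: project the database to a low dimension (stands in for the reduced vectors
   # that the GPU-side stage would scan).
   _, _, vt = np.linalg.svd(base[:2_000], full_matrices=False)    # fit on a sample
   proj = vt[:32].T                                               # 128 -> 32 projection
   base_lo, query_lo = base @ proj, query @ proj

   # Stage 1: cheap scan in the reduced space to get a candidate pool.
   d_lo = np.linalg.norm(base_lo - query_lo, axis=1)
   candidates = np.argpartition(d_lo, 200)[:200]

   # Stage 2: exact distances on the candidates only, using the complete vectors.
   d_hi = np.linalg.norm(base[candidates] - query, axis=1)
   top10 = candidates[np.argsort(d_hi)[:10]]
   print(top10)
   ```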

6. arXiv:2503.21197 [pdf, ps, other]  cs.MM (Multimedia); cs.CV (Computer Vision and Pattern Recognition)
   WVSC: Wireless Video Semantic Communication with Multi-frame Compensation
   Authors: Bingyan Xie, Yongpeng Wu, Yuxuan Shi, Biqian Feng, Wenjun Zhang, Jihong Park, Tony Q. S. Quek
   Abstract: Existing wireless video transmission schemes directly conduct video coding at the pixel level, while neglecting the inner semantics contained in videos. In this paper, we propose a wireless video semantic communication framework, abbreviated as WVSC, which integrates the idea of semantic communication into wireless video transmission scenarios. WVSC first encodes original video frames as semantic frames and then conducts video coding based on such compact representations, enabling video coding at the semantic level rather than the pixel level. Moreover, to further reduce the communication overhead, a reference semantic frame is introduced to substitute the motion vectors of each frame in common video coding methods. At the receiver, multi-frame compensation (MFC) is proposed to produce the compensated current semantic frame with a multi-frame fusion attention module. With both reference frame transmission and MFC, bandwidth efficiency improves with satisfactory video transmission performance. Experimental results verify a performance gain of WVSC over other DL-based methods (e.g., DVSC) of about 1 dB, and over traditional schemes of about 2 dB, in terms of PSNR.
   Submitted 27 March, 2025; originally announced March 2025.
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.20212">arXiv:2503.20212</a> <span> [<a href="https://arxiv.org/pdf/2503.20212">pdf</a>, <a href="https://arxiv.org/format/2503.20212">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Dolphin: A Large-Scale Automatic Speech Recognition Model for Eastern Languages </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Meng%2C+Y">Yangyang Meng</a>, <a href="/search/cs?searchtype=author&query=Li%2C+J">Jinpeng Li</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+G">Guodong Lin</a>, <a href="/search/cs?searchtype=author&query=Pu%2C+Y">Yu Pu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+G">Guanbo Wang</a>, <a href="/search/cs?searchtype=author&query=Du%2C+H">Hu Du</a>, <a href="/search/cs?searchtype=author&query=Shao%2C+Z">Zhiming Shao</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+Y">Yukai Huang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+K">Ke Li</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+W">Wei-Qiang Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.20212v1-abstract-short" style="display: inline;"> This report introduces Dolphin, a large-scale multilingual automatic speech recognition (ASR) model that extends the Whisper architecture to support a wider range of languages. Our approach integrates in-house proprietary and open-source datasets to refine and optimize Dolphin's performance. The model is specifically designed to achieve notable recognition accuracy for 40 Eastern languages across… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.20212v1-abstract-full').style.display = 'inline'; document.getElementById('2503.20212v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.20212v1-abstract-full" style="display: none;"> This report introduces Dolphin, a large-scale multilingual automatic speech recognition (ASR) model that extends the Whisper architecture to support a wider range of languages. Our approach integrates in-house proprietary and open-source datasets to refine and optimize Dolphin's performance. The model is specifically designed to achieve notable recognition accuracy for 40 Eastern languages across East Asia, South Asia, Southeast Asia, and the Middle East, while also supporting 22 Chinese dialects. Experimental evaluations show that Dolphin significantly outperforms current state-of-the-art open-source models across various languages. To promote reproducibility and community-driven innovation, we are making our trained models and inference source code publicly available. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.20212v1-abstract-full').style.display = 'none'; document.getElementById('2503.20212v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.19912">arXiv:2503.19912</a> <span> [<a href="https://arxiv.org/pdf/2503.19912">pdf</a>, <a href="https://arxiv.org/format/2503.19912">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> SuperFlow++: Enhanced Spatiotemporal Consistency for Cross-Modal Data Pretraining </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xu%2C+X">Xiang Xu</a>, <a href="/search/cs?searchtype=author&query=Kong%2C+L">Lingdong Kong</a>, <a href="/search/cs?searchtype=author&query=Shuai%2C+H">Hui Shuai</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+W">Wenwei Zhang</a>, <a href="/search/cs?searchtype=author&query=Pan%2C+L">Liang Pan</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+K">Kai Chen</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Z">Ziwei Liu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Q">Qingshan Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.19912v1-abstract-short" style="display: inline;"> LiDAR representation learning has emerged as a promising approach to reducing reliance on costly and labor-intensive human annotations. While existing methods primarily focus on spatial alignment between LiDAR and camera sensors, they often overlook the temporal dynamics critical for capturing motion and scene continuity in driving scenarios. To address this limitation, we propose SuperFlow++, a n… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.19912v1-abstract-full').style.display = 'inline'; document.getElementById('2503.19912v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.19912v1-abstract-full" style="display: none;"> LiDAR representation learning has emerged as a promising approach to reducing reliance on costly and labor-intensive human annotations. While existing methods primarily focus on spatial alignment between LiDAR and camera sensors, they often overlook the temporal dynamics critical for capturing motion and scene continuity in driving scenarios. To address this limitation, we propose SuperFlow++, a novel framework that integrates spatiotemporal cues in both pretraining and downstream tasks using consecutive LiDAR-camera pairs. 
SuperFlow++ introduces four key components: (1) a view consistency alignment module to unify semantic information across camera views, (2) a dense-to-sparse consistency regularization mechanism to enhance feature robustness across varying point cloud densities, (3) a flow-based contrastive learning approach that models temporal relationships for improved scene understanding, and (4) a temporal voting strategy that propagates semantic information across LiDAR scans to improve prediction consistency. Extensive evaluations on 11 heterogeneous LiDAR datasets demonstrate that SuperFlow++ outperforms state-of-the-art methods across diverse tasks and driving conditions. Furthermore, by scaling both 2D and 3D backbones during pretraining, we uncover emergent properties that provide deeper insights into developing scalable 3D foundation models. With strong generalizability and computational efficiency, SuperFlow++ establishes a new benchmark for data-efficient LiDAR-based perception in autonomous driving. The code is publicly available at https://github.com/Xiangxu-0103/SuperFlow <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.19912v1-abstract-full').style.display = 'none'; document.getElementById('2503.19912v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Preprint; 15 pages, 6 figures, 10 tables; Code at https://github.com/Xiangxu-0103/SuperFlow</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.19612">arXiv:2503.19612</a> <span> [<a href="https://arxiv.org/pdf/2503.19612">pdf</a>, <a href="https://arxiv.org/format/2503.19612">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> RL-finetuning LLMs from on- and off-policy data with a single algorithm </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Tang%2C+Y">Yunhao Tang</a>, <a href="/search/cs?searchtype=author&query=Cohen%2C+T">Taco Cohen</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+D+W">David W. Zhang</a>, <a href="/search/cs?searchtype=author&query=Valko%2C+M">Michal Valko</a>, <a href="/search/cs?searchtype=author&query=Munos%2C+R">R茅mi Munos</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.19612v1-abstract-short" style="display: inline;"> We introduce a novel reinforcement learning algorithm (AGRO, for Any-Generation Reward Optimization) for fine-tuning large-language models. AGRO leverages the concept of generation consistency, which states that the optimal policy satisfies the notion of consistency across any possible generation of the model. 
   We derive algorithms that find optimal solutions via the sample-based policy gradient and provide theoretical guarantees on their convergence. Our experiments demonstrate the effectiveness of AGRO in both on-policy and off-policy settings, showing improved performance on the mathematical reasoning dataset over baseline algorithms.
   Submitted 25 March, 2025; originally announced March 2025.

10. arXiv:2503.19573 [pdf, other]  cs.SI (Social and Information Networks); cs.DB (Databases)
   Motif Counting in Complex Networks: A Comprehensive Survey
   Authors: Haozhe Yin, Kai Wang, Wenjie Zhang, Yizhang He, Ying Zhang, Xuemin Lin
   Abstract: Motif counting plays a crucial role in understanding the structural properties of networks. By computing motif frequencies, researchers can draw key insights into the structural properties of the underlying network. As networks become increasingly complex, different graph models have been proposed, giving rise to diverse motif patterns. These variations introduce unique computational challenges that require specialized algorithms tailored to specific motifs within different graph structures. This survey provides a comprehensive and structured overview of motif counting techniques across general graphs, heterogeneous graphs, and hypergraphs. We categorize existing algorithms according to their underlying computational strategies, emphasizing key similarities and distinctions. In addition to reviewing current methodologies, we examine their strengths, limitations, and computational trade-offs. Furthermore, we explore future directions in motif counting, including scalable implementations to improve efficiency in large-scale networks, algorithmic adaptations for dynamic, temporal, and attributed graphs, and deeper integration with large language models (LLMs) and graph-based retrieval-augmented generation (GraphRAG). By offering a detailed analysis of these approaches, this survey aims to support researchers and practitioners in advancing motif counting for increasingly complex network data.
   Submitted 25 March, 2025; originally announced March 2025.
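
   For readers unfamiliar with the primitive this survey covers, here is a minimal sketch of counting one classic motif, the triangle, in a small undirected graph. It is illustrative only and not drawn from the survey, which targets far more general motifs and far more scalable algorithms.

   ```python
   # Count triangles in an undirected graph given as an adjacency-set dict.
   from itertools import combinations

   def count_triangles(adj):
       """adj: dict mapping node -> set of neighbours (undirected)."""
       count = 0
       for u, neighbours in adj.items():
           # If two neighbours of u are themselves adjacent, the three nodes
           # form a triangle; each triangle is seen once per vertex.
           for v, w in combinations(neighbours, 2):
               if w in adj[v]:
                   count += 1
       return count // 3  # every triangle was counted at its three vertices

   graph = {0: {1, 2}, 1: {0, 2, 3}, 2: {0, 1}, 3: {1}}
   print(count_triangles(graph))  # -> 1 (triangle 0-1-2)
   ```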
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.19499">arXiv:2503.19499</a> <span> [<a href="https://arxiv.org/pdf/2503.19499">pdf</a>, <a href="https://arxiv.org/format/2503.19499">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> </div> </div> <p class="title is-5 mathjax"> SparSamp: Efficient Provably Secure Steganography Based on Sparse Sampling </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yaofei Wang</a>, <a href="/search/cs?searchtype=author&query=Pei%2C+G">Gang Pei</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+K">Kejiang Chen</a>, <a href="/search/cs?searchtype=author&query=Ding%2C+J">Jinyang Ding</a>, <a href="/search/cs?searchtype=author&query=Pan%2C+C">Chao Pan</a>, <a href="/search/cs?searchtype=author&query=Pang%2C+W">Weilong Pang</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+D">Donghui Hu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+W">Weiming Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.19499v1-abstract-short" style="display: inline;"> Steganography embeds confidential data within seemingly innocuous communications. Provable security in steganography, a long-sought goal, has become feasible with deep generative models. However, existing methods face a critical trade-off between security and efficiency. This paper introduces SparSamp, an efficient provably secure steganography method based on sparse sampling. SparSamp embeds mess… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.19499v1-abstract-full').style.display = 'inline'; document.getElementById('2503.19499v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.19499v1-abstract-full" style="display: none;"> Steganography embeds confidential data within seemingly innocuous communications. Provable security in steganography, a long-sought goal, has become feasible with deep generative models. However, existing methods face a critical trade-off between security and efficiency. This paper introduces SparSamp, an efficient provably secure steganography method based on sparse sampling. SparSamp embeds messages by combining them with pseudo-random numbers to obtain message-derived random numbers for sampling. It enhances extraction accuracy and embedding capacity by increasing the sampling intervals and making the sampling process sparse. SparSamp preserves the original probability distribution of the generative model, thus ensuring security. It introduces only $O(1)$ additional complexity per sampling step, enabling the fastest embedding speed without compromising generation speed. SparSamp is designed to be plug-and-play; message embedding can be achieved by simply replacing the sampling component of an existing generative model with SparSamp. We implemented SparSamp in text, image, and audio generation models. It can achieve embedding speeds of up to 755 bits/second with GPT-2, 5046 bits/second with DDPM, and 9,223 bits/second with WaveRNN. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.19499v1-abstract-full').style.display = 'none'; document.getElementById('2503.19499v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">To Appear in the 34th USENIX Security Symposium (USENIX Security '25)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.19470">arXiv:2503.19470</a> <span> [<a href="https://arxiv.org/pdf/2503.19470">pdf</a>, <a href="https://arxiv.org/format/2503.19470">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> ReSearch: Learning to Reason with Search for LLMs via Reinforcement Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+M">Mingyang Chen</a>, <a href="/search/cs?searchtype=author&query=Li%2C+T">Tianpeng Li</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+H">Haoze Sun</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+Y">Yijie Zhou</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+C">Chenzheng Zhu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+H">Haofen Wang</a>, <a href="/search/cs?searchtype=author&query=Pan%2C+J+Z">Jeff Z. Pan</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+W">Wen Zhang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+H">Huajun Chen</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+F">Fan Yang</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+Z">Zenan Zhou</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+W">Weipeng Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.19470v2-abstract-short" style="display: inline;"> Large Language Models (LLMs) have shown remarkable capabilities in reasoning, exemplified by the success of OpenAI-o1 and DeepSeek-R1. However, integrating reasoning with external search processes remains challenging, especially for complex multi-hop questions requiring multiple retrieval steps. We propose ReSearch, a novel framework that trains LLMs to Reason with Search via reinforcement learnin… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.19470v2-abstract-full').style.display = 'inline'; document.getElementById('2503.19470v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.19470v2-abstract-full" style="display: none;"> Large Language Models (LLMs) have shown remarkable capabilities in reasoning, exemplified by the success of OpenAI-o1 and DeepSeek-R1. However, integrating reasoning with external search processes remains challenging, especially for complex multi-hop questions requiring multiple retrieval steps. 
arXiv:2503.19470
Subjects: cs.AI (Artificial Intelligence); cs.CL (Computation and Language)
Title: ReSearch: Learning to Reason with Search for LLMs via Reinforcement Learning
Authors: Mingyang Chen, Tianpeng Li, Haoze Sun, Yijie Zhou, Chenzheng Zhu, Haofen Wang, Jeff Z. Pan, Wen Zhang, Huajun Chen, Fan Yang, Zenan Zhou, Weipeng Chen
Abstract: Large Language Models (LLMs) have shown remarkable capabilities in reasoning, exemplified by the success of OpenAI-o1 and DeepSeek-R1. However, integrating reasoning with external search processes remains challenging, especially for complex multi-hop questions requiring multiple retrieval steps. We propose ReSearch, a novel framework that trains LLMs to Reason with Search via reinforcement learning without using any supervised data on reasoning steps. Our approach treats search operations as integral components of the reasoning chain, where when and how to perform searches is guided by text-based thinking, and search results subsequently influence further reasoning. We train ReSearch on Qwen2.5-7B(-Instruct) and Qwen2.5-32B(-Instruct) models and conduct extensive experiments. Despite being trained on only one dataset, our models demonstrate strong generalizability across various benchmarks. Analysis reveals that ReSearch naturally elicits advanced reasoning capabilities such as reflection and self-correction during the reinforcement learning process.
Submitted 27 March, 2025; v1 submitted 25 March, 2025; originally announced March 2025.
Comments: Work in progress

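The abstract describes search calls as steps inside the reasoning chain but gives no interface. A minimal, hypothetical rollout loop of that kind (the tag names and the generate()/retrieve() callables are placeholders, not the paper's API) might look like:

```python
# Hypothetical reason-with-search rollout; all names and tags are illustrative assumptions.
def rollout(question, generate, retrieve, max_searches=4):
    """Interleave free-text thinking with search calls until the model answers."""
    context = question
    for _ in range(max_searches):
        step = generate(context, stop=["</search>", "</answer>"])  # think, search, or answer
        context += step
        if "<search>" in step:                        # the model issued a query inside its reasoning
            query = step.split("<search>")[-1].strip()
            results = retrieve(query)                 # external retriever / search engine
            context += f"</search>\n<results>{results}</results>\n"
        else:                                         # the model produced an answer instead
            break
    return context
```

During RL training, whole rollouts like this would be scored (for example, by final-answer correctness) and the policy updated, so the model learns when and what to search without step-level supervision.
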
arXiv:2503.18854
Subjects: cs.CV (Computer Vision and Pattern Recognition); cs.AI (Artificial Intelligence)
Title: MC-LLaVA: Multi-Concept Personalized Vision-Language Model
Authors: Ruichuan An, Sihan Yang, Ming Lu, Renrui Zhang, Kai Zeng, Yulin Luo, Jiajun Cao, Hao Liang, Ying Chen, Qi She, Shanghang Zhang, Wentao Zhang
Abstract: Current vision-language models (VLMs) show exceptional abilities across diverse tasks, such as visual question answering. To enhance user experience, recent studies investigate VLM personalization to understand user-provided concepts. However, they mainly focus on single-concept personalization, neglecting the existence and interplay of multiple concepts, which limits real-world applicability. This paper proposes the first multi-concept personalization paradigm, MC-LLaVA. Specifically, MC-LLaVA employs a multi-concept instruction tuning strategy, effectively integrating multiple concepts in a single training step. To reduce the costs related to joint training, we propose a personalized textual prompt that uses visual token information to initialize concept tokens. Additionally, we introduce a personalized visual prompt during inference, aggregating location confidence maps for enhanced recognition and grounding capabilities. To advance multi-concept personalization research, we further contribute a high-quality instruction tuning dataset. We carefully collect images with multiple characters and objects from movies and manually generate question-answer samples for multi-concept scenarios, featuring superior diversity. Comprehensive qualitative and quantitative experiments demonstrate that MC-LLaVA can achieve impressive multi-concept personalized responses, paving the way for VLMs to become better user-specific assistants. The code and dataset will be publicly available at https://github.com/arctanxarc/MC-LLaVA.
Submitted 25 March, 2025; v1 submitted 24 March, 2025; originally announced March 2025.
Comments: I sincerely apologize for any inconvenience caused. We actually uploaded this paper to arXiv in November 2024, as arXiv:2411.11706. During this update, we did not consider the replacement operation of arXiv, which led to duplicate submissions. We have made modifications at the original address arXiv:2411.11706.

arXiv:2503.18665
Subjects: cs.CV (Computer Vision and Pattern Recognition)
Title: Boosting Virtual Agent Learning and Reasoning: A Step-wise, Multi-dimensional, and Generalist Reward Model with Benchmark
Authors: Bingchen Miao, Yang Wu, Minghe Gao, Qifan Yu, Wendong Bu, Wenqiao Zhang, Yunfei Li, Siliang Tang, Tat-Seng Chua, Juncheng Li
Abstract: The development of Generalist Virtual Agents (GVAs) powered by Multimodal Large Language Models (MLLMs) has shown significant promise in autonomous task execution. However, current training paradigms face critical limitations, including reliance on outcome supervision and labor-intensive human annotations. To address these challenges, we propose Similar, a Step-wise Multi-dimensional Generalist Reward Model, which offers fine-grained signals for agent training and can choose better actions for inference-time scaling. Specifically, we begin by systematically defining five dimensions for evaluating agent actions. Building on this framework, we design an MCTS-P algorithm to automatically collect and annotate step-wise, five-dimensional agent execution data. Using this data, we train Similar with the Triple-M strategy. Furthermore, we introduce the first benchmark in the virtual agent domain for step-wise, multi-dimensional reward model training and evaluation, named SRM. This benchmark consists of two components: SRMTrain, which serves as the training set for Similar, and SRMEval, a manually selected test set for evaluating the reward model. Experimental results demonstrate that Similar, through its step-wise, multi-dimensional assessment and synergistic gain, provides GVAs with effective intermediate signals during both training and inference-time scaling. The code is available at https://github.com/Galery23/Similar-v1.
Submitted 24 March, 2025; originally announced March 2025.

arXiv:2503.18641
Subjects: cs.AI (Artificial Intelligence)
Title: From Fragment to One Piece: A Survey on AI-Driven Graphic Design
Authors: Xingxing Zou, Wen Zhang, Nanxuan Zhao
Abstract: This survey provides a comprehensive overview of the advancements in Artificial Intelligence in Graphic Design (AIGD), focusing on integrating AI techniques to support design interpretation and enhance the creative process. We categorize the field into two primary directions: perception tasks, which involve understanding and analyzing design elements, and generation tasks, which focus on creating new design elements and layouts. The survey covers various subtasks, including visual element perception and generation, aesthetic and semantic understanding, and layout analysis and generation. We highlight the role of large language models and multimodal approaches in bridging the gap between localized visual features and global design intent. Despite significant progress, challenges remain in understanding human intent, ensuring interpretability, and maintaining control over multilayered compositions. This survey serves as a guide for researchers, providing information on the current state of AIGD and potential future directions (https://github.com/zhangtianer521/excellent_Intelligent_graphic_design).
Submitted 24 March, 2025; originally announced March 2025.

arXiv:2503.18363
Subjects: cs.CV (Computer Vision and Pattern Recognition)
Title: MonoInstance: Enhancing Monocular Priors via Multi-view Instance Alignment for Neural Rendering and Reconstruction
Authors: Wenyuan Zhang, Yixiao Yang, Han Huang, Liang Han, Kanle Shi, Yu-Shen Liu
Abstract: Monocular depth priors have been widely adopted by neural rendering in multi-view based tasks such as 3D reconstruction and novel view synthesis. However, due to inconsistent predictions across views, how to effectively leverage monocular cues in a multi-view context remains a challenge. Current methods treat the entire estimated depth map indiscriminately and use it as ground-truth supervision, ignoring the inherent inaccuracy and cross-view inconsistency of monocular priors. To resolve these issues, we propose MonoInstance, a general approach that explores the uncertainty of monocular depths to provide enhanced geometric priors for neural rendering and reconstruction. Our key insight lies in aligning the depths of each segmented instance from multiple views within a common 3D space, thereby casting the uncertainty estimation of monocular depths into a density measure within noisy point clouds. For high-uncertainty areas where depth priors are unreliable, we further introduce a constraint term that encourages the projected instances to align with the corresponding instance masks on nearby views. MonoInstance is a versatile strategy that can be seamlessly integrated into various multi-view neural rendering frameworks. Our experimental results demonstrate that MonoInstance significantly improves performance in both reconstruction and novel view synthesis on various benchmarks.
Submitted 24 March, 2025; originally announced March 2025.
Comments: Accepted by CVPR 2025. Project page: https://wen-yuan-zhang.github.io/MonoInstance/

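The abstract casts depth uncertainty as point density in a fused point cloud. A rough sketch of that idea (the radius, normalization, and interfaces are assumptions, not the authors' implementation):

```python
# Rough sketch: cross-view depth agreement measured as local point density (illustrative only).
import numpy as np
from scipy.spatial import cKDTree

def instance_uncertainty(points_per_view, radius=0.05):
    """points_per_view: list of (N_i, 3) arrays of an instance's back-projected depths, one per view.
    Returns a per-point uncertainty in [0, 1]: sparse neighbourhoods imply unreliable depth."""
    all_pts = np.concatenate(points_per_view, axis=0)
    tree = cKDTree(all_pts)
    counts = np.array([len(tree.query_ball_point(p, radius)) for p in all_pts], dtype=float)
    density = counts / counts.max()
    return 1.0 - density
```

Points on which several views agree pile up and receive high density (low uncertainty); points supported by only one view stay isolated and would be down-weighted or handled by the mask-alignment constraint the abstract mentions.
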
arXiv:2503.18361
Subjects: cs.CV (Computer Vision and Pattern Recognition)
Title: NeRFPrior: Learning Neural Radiance Field as a Prior for Indoor Scene Reconstruction
Authors: Wenyuan Zhang, Emily Yue-ting Jia, Junsheng Zhou, Baorui Ma, Kanle Shi, Yu-Shen Liu
Abstract: Recently, it has been shown that priors are vital for neural implicit functions to reconstruct high-quality surfaces from multi-view RGB images. However, current priors require large-scale pre-training and merely provide geometric clues without considering the importance of color. In this paper, we present NeRFPrior, which adopts a neural radiance field as a prior to learn signed distance fields using volume rendering for surface reconstruction. Our NeRF prior can provide both geometric and color clues, and can be trained quickly on the same scene without additional data. Based on the NeRF prior, we learn a signed distance function (SDF) by explicitly imposing a multi-view consistency constraint on each ray intersection for surface inference. Specifically, at each ray intersection, we use the density in the prior as a coarse geometry estimate, while using the color near the surface as a clue to check its visibility from another view angle. For textureless areas where the multi-view consistency constraint does not work well, we further introduce a depth consistency loss with confidence weights to infer the SDF. Our method outperforms the state-of-the-art methods on widely used benchmarks.
Submitted 24 March, 2025; originally announced March 2025.
Comments: Accepted by CVPR 2025. Project page: https://wen-yuan-zhang.github.io/NeRFPrior/

arXiv:2503.18245
Subjects: cs.LG (Machine Learning)
Title: DiffGED: Computing Graph Edit Distance via Diffusion-based Graph Matching
Authors: Wei Huang, Hanchen Wang, Dong Wen, Wenjie Zhang, Ying Zhang, Xuemin Lin
Abstract: The Graph Edit Distance (GED) problem, which aims to compute the minimum number of edit operations required to transform one graph into another, is a fundamental challenge in graph analysis with wide-ranging applications. However, due to its NP-hard nature, traditional A* approaches often suffer from scalability issues, making them computationally intractable for large graphs. Many recent deep learning frameworks address GED by formulating it as a regression task, which, while efficient, fails to recover the edit path -- a central interest in GED. Furthermore, recent hybrid approaches that combine deep learning with traditional methods to recover the edit path often yield poor solution quality. These methods also struggle to generate candidate solutions in parallel, resulting in increased running times. In this paper, we present a novel approach, DiffGED, that leverages a generative diffusion model to solve GED and recover the corresponding edit path. Specifically, we first generate multiple diverse node matching matrices in parallel through a diffusion-based graph matching model. Next, node mappings are extracted from each generated matching matrix in parallel, and each extracted node mapping can be simply transformed into an edit path. Benefiting from the generative diversity provided by the diffusion model, DiffGED is less likely to fall into locally sub-optimal solutions, thereby achieving superior overall solution quality close to the exact solution. Experimental results on real-world datasets demonstrate that DiffGED can generate multiple diverse edit paths with exceptionally high accuracy comparable to exact solutions while maintaining a running time shorter than most hybrid approaches.
Submitted 23 March, 2025; originally announced March 2025.

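The abstract states that each generated matching matrix is turned into a node mapping and then into an edit path, but it does not spell out the extraction rule. A generic illustration of how one mapping yields an edit cost (assuming unlabeled graphs, n1 <= n2, and a Hungarian assignment; DiffGED's own extraction step may differ):

```python
# Illustrative only: one soft node-matching matrix -> node mapping -> implied edit cost.
import numpy as np
from scipy.optimize import linear_sum_assignment

def mapping_to_edit_cost(match_matrix, A1, A2):
    """match_matrix: (n1, n2) matching scores; A1, A2: adjacency matrices with n1 <= n2."""
    rows, cols = linear_sum_assignment(-match_matrix)   # maximize total matching score
    mapping = dict(zip(rows, cols))
    n1, n2 = A1.shape[0], A2.shape[0]
    cost = n2 - n1                                       # unmatched target nodes must be inserted
    for i in range(n1):                                  # edge edits between mapped node pairs
        for j in range(i + 1, n1):
            if A1[i, j] != A2[mapping[i], mapping[j]]:
                cost += 1
    mapped = set(mapping.values())                       # edges touching inserted nodes need inserting
    for u in range(n2):
        for v in range(u + 1, n2):
            if A2[u, v] and (u not in mapped or v not in mapped):
                cost += 1
    return cost, mapping
```

Running this over many diverse matrices sampled in parallel and keeping the cheapest mapping is what lets a diverse generator approach the exact GED, which is the argument the abstract makes.
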
arXiv:2503.17793
Subjects: cs.LG (Machine Learning); cs.AI (Artificial Intelligence); cs.CL (Computation and Language)
Title: Every Sample Matters: Leveraging Mixture-of-Experts and High-Quality Data for Efficient and Accurate Code LLM
Authors: Codefuse, Ling Team: Wenting Cai, Yuchen Cao, Chaoyu Chen, Chen Chen, Siba Chen, Qing Cui, Peng Di, Junpeng Fang, Zi Gong, Ting Guo, Zhengyu He, Yang Huang, Cong Li, Jianguo Li, Zheng Li, Shijie Lian, BingChang Liu, Songshan Luo, Shuo Mao, Min Shen, Jian Wu, Jiaolong Yang, et al. (8 additional authors not shown)
Abstract: Recent advancements in code large language models (LLMs) have demonstrated remarkable capabilities in code generation and understanding. It is still challenging to build a code LLM with comprehensive performance yet ultimate efficiency. Many attempts have been released in the open source community to break the trade-off between performance and efficiency, such as the Qwen Coder series and the DeepSeek Coder series. This paper introduces yet another attempt in this area, namely Ling-Coder-Lite. We leverage the efficient Mixture-of-Experts (MoE) architecture along with a set of high-quality data curation methods (especially those based on program analytics) to build an efficient yet powerful code LLM. Ling-Coder-Lite exhibits on-par performance on 12 representative coding benchmarks compared to state-of-the-art models of similar size, such as Qwen2.5-Coder-7B and DeepSeek-Coder-V2-Lite, while offering competitive latency and throughput. In practice, we achieve a 50% reduction in deployment resources compared to the similar-sized dense model without performance loss. To facilitate further research and development in this area, we open-source our models as well as a substantial portion of high-quality data for the annealing and post-training stages. The models and data can be accessed at https://huggingface.co/inclusionAI/Ling-Coder-lite.
Submitted 22 March, 2025; originally announced March 2025.
Comments: 20 pages, 6 figures
ACM Class: I.2.7

arXiv:2503.17088
Subjects: cs.IT (Information Theory)
Title: Unsourced Random Access in MIMO Quasi-Static Rayleigh Fading Channels: Finite Blocklength and Scaling Law Analyses
Authors: Junyuan Gao, Yongpeng Wu, Giuseppe Caire, Wei Yang, H. Vincent Poor, Wenjun Zhang
Abstract: This paper considers the unsourced random access (URA) problem with a random and unknown number of active users in multiple-input multiple-output (MIMO) quasi-static Rayleigh fading channels. We derive non-asymptotic achievability bounds on the probability of incorrectly estimating the number of active users, and provide scaling laws on the gap between the estimated and true numbers of active users. We prove that the error probability reaches a plateau as the power $P$ and blocklength $n$ increase, whereas it decays exponentially with the number $L$ of receive antennas and eventually vanishes. Then, we explore the fundamental limits of URA by deriving non-asymptotic achievability bounds and converse bounds (including two single-user converse bounds and one multi-user ensemble converse bound) on the minimum energy-per-bit required by each active user to transmit $J$ bits with blocklength $n$ under misdetection and false-alarm constraints. Numerical results show that the extra required energy-per-bit due to the uncertainty in the number ${\rm{K}}_a$ of active users decreases as $L$ and $\mathbb{E}[{\rm{K}}_a]$ increase and as the error requirement becomes milder. In the non-asymptotic regime, using codewords distributed on a sphere outperforms Gaussian random coding. Existing schemes are shown to exhibit a large gap to our bounds when the number of active users is large, calling for more advanced schemes that perform energy-efficiently in this case. In the asymptotic regime with $n\to\infty$, we establish scaling laws on the minimum required $P$ and $L$ to reliably support ${\rm{K}}_a$ active users as functions of $n$, which highlight the potential of MIMO in enabling low-cost communication and indicate that it is possible for the minimum required $P$ and $L$ to remain on the same order when the number of active users increases but stays below a threshold.
Submitted 21 March, 2025; originally announced March 2025.
Comments: Accepted by IEEE Transactions on Information Theory

arXiv:2503.17050
Subjects: cs.CV (Computer Vision and Pattern Recognition)
Title: Scoring, Remember, and Reference: Catching Camouflaged Objects in Videos
Authors: Yuang Feng, Shuyong Gao, Fuzhen Yan, Yicheng Song, Lingyi Hong, Junjie Hu, Wenqiang Zhang
Abstract: Video Camouflaged Object Detection (VCOD) aims to segment objects whose appearances closely resemble their surroundings, posing a challenging and emerging task. Existing vision models often struggle in such scenarios due to the indistinguishable appearance of camouflaged objects and the insufficient exploitation of dynamic information in videos. To address these challenges, we propose an end-to-end VCOD framework inspired by human memory recognition, which leverages historical video information by integrating memory reference frames for camouflaged sequence processing. Specifically, we design a dual-purpose decoder that simultaneously generates predicted masks and scores, enabling reference frame selection based on scores while introducing auxiliary supervision to enhance feature extraction. Furthermore, this study introduces a novel reference-guided multilevel asymmetric attention mechanism, effectively integrating long-term reference information with short-term motion cues for comprehensive feature extraction. By combining these modules, we develop the Scoring, Remember, and Reference (SRR) framework, which efficiently extracts information to locate targets and employs memory guidance to improve subsequent processing. With its optimized module design and effective utilization of video data, our model achieves significant performance improvements, surpassing existing approaches by 10% on benchmark datasets while requiring fewer parameters (54M) and only a single pass through the video. The code will be made publicly available.
Submitted 21 March, 2025; originally announced March 2025.

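The one mechanism the abstract pins down is that the decoder's confidence scores decide which past frames are kept as memory references. A minimal, hypothetical sketch of that selection step (the decoder interface and memory size are assumptions):

```python
# Hypothetical score-based reference-frame memory of the kind the SRR abstract describes.
def process_frame(frame_feat, memory, decoder, max_refs=3):
    """Predict a mask for the current frame and keep only the highest-scoring frames as references."""
    mask, score = decoder(frame_feat, memory)        # dual-purpose decoder: mask + confidence score
    memory.append((score, frame_feat))
    memory.sort(key=lambda item: item[0], reverse=True)
    del memory[max_refs:]                            # retain the top-scoring reference frames
    return mask, memory
```
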
arXiv:2503.17029
Subjects: cs.CV (Computer Vision and Pattern Recognition)
Title: AnimatePainter: A Self-Supervised Rendering Framework for Reconstructing Painting Process
Authors: Junjie Hu, Shuyong Gao, Qianyu Guo, Yan Wang, Qishan Wang, Yuang Feng, Wenqiang Zhang
Abstract: Humans can intuitively decompose an image into a sequence of strokes to create a painting, yet existing methods for generating drawing processes are limited to specific data types and often rely on expensive human-annotated datasets. We propose a novel self-supervised framework for generating drawing processes from any type of image, treating the task as a video generation problem. Our approach reverses the drawing process by progressively removing strokes from a reference image, simulating a human-like creation sequence. Crucially, our method does not require costly datasets of real human drawing processes; instead, we leverage depth estimation and stroke rendering to construct a self-supervised dataset. We model human drawings as "refinement" and "layering" processes and introduce depth fusion layers to enable video generation models to learn and replicate human drawing behavior. Extensive experiments validate the effectiveness of our approach, demonstrating its ability to generate realistic drawings without the need for real drawing process data.
Submitted 21 March, 2025; originally announced March 2025.

arXiv:2503.16963
Subjects: cs.CV (Computer Vision and Pattern Recognition)
Title: Center-guided Classifier for Semantic Segmentation of Remote Sensing Images
Authors: Wei Zhang, Mengting Ma, Yizhen Jiang, Rongrong Lian, Zhenkai Wu, Kangning Cui, Xiaowen Ma
Abstract: Compared with natural images, remote sensing images (RSIs) have a unique characteristic, i.e., larger intraclass variance, which makes semantic segmentation for remote sensing images more challenging. Moreover, existing semantic segmentation models for remote sensing images usually employ a vanilla softmax classifier, which has three drawbacks: (1) non-direct supervision of the pixel representations during training; (2) inadequate modeling ability of parametric softmax classifiers under large intraclass variance; and (3) an opaque classification decision process. In this paper, we propose a novel classifier (called CenterSeg) customized for RSI semantic segmentation, which solves the above problems with multiple prototypes, direct supervision under the Grassmann manifold, and an interpretability strategy. Specifically, for each class, our CenterSeg obtains local class centers by aggregating corresponding pixel features based on ground-truth masks, and generates multiple prototypes through hard attention assignment and momentum updating. In addition, we introduce the Grassmann manifold and constrain the joint embedding space of pixel features and prototypes based on two additional regularization terms. During inference, CenterSeg can further provide interpretability by restricting each prototype to a sample of the training set. Experimental results on three remote sensing segmentation datasets validate the effectiveness of the model. Besides its superior performance, CenterSeg has the advantages of simplicity, light weight, compatibility, and interpretability. Code is available at https://github.com/xwmaxwma/rssegmentation.
Submitted 21 March, 2025; originally announced March 2025.

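As a rough sketch of the prototype bookkeeping the abstract describes, simplified to one prototype per class and a plain EMA (the paper maintains multiple prototypes per class via hard attention assignment, which is omitted here):

```python
# Simplified sketch: ground-truth-masked class centers and momentum-updated prototypes.
import torch

def masked_class_centers(feats, labels, num_classes):
    """feats: (B, C, H, W) pixel features; labels: (B, H, W) ground-truth class ids.
    Returns (num_classes, C): the mean feature of each class present in the batch."""
    B, C, H, W = feats.shape
    flat = feats.permute(0, 2, 3, 1).reshape(-1, C)
    lab = labels.reshape(-1)
    centers = torch.zeros(num_classes, C, device=feats.device)
    for k in range(num_classes):
        sel = lab == k
        if sel.any():
            centers[k] = flat[sel].mean(dim=0)
    # Classes absent from the batch keep a zero center here; a real implementation would skip them.
    return centers

def momentum_update(prototypes, centers, m=0.99):
    """Exponential moving average of class prototypes toward the batch-wise local centers."""
    return m * prototypes + (1.0 - m) * centers
```

Classifying a pixel then amounts to comparing its feature against the momentum-smoothed prototypes, which is one way to read the interpretability claim in the abstract.
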
Recent research conducted by Robust Intelligence, a subsidiary of Cisco, in collaboration with the University of Pennsylvania, revealed that DeepSeek-R1 achieves a 100\% attack success… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.16529v1-abstract-full').style.display = 'inline'; document.getElementById('2503.16529v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.16529v1-abstract-full" style="display: none;"> DeepSeek-R1, renowned for its exceptional reasoning capabilities and open-source strategy, is significantly influencing the global artificial intelligence landscape. However, it exhibits notable safety shortcomings. Recent research conducted by Robust Intelligence, a subsidiary of Cisco, in collaboration with the University of Pennsylvania, revealed that DeepSeek-R1 achieves a 100\% attack success rate when processing harmful prompts. Furthermore, multiple security firms and research institutions have identified critical security vulnerabilities within the model. Although China Unicom has uncovered safety vulnerabilities of R1 in Chinese contexts, the safety capabilities of the remaining distilled models in the R1 series have not yet been comprehensively evaluated. To address this gap, this study utilizes the comprehensive Chinese safety benchmark CHiSafetyBench to conduct an in-depth safety evaluation of the DeepSeek-R1 series distilled models. The objective is to assess the safety capabilities of these models in Chinese contexts both before and after distillation, and to further elucidate the adverse effects of distillation on model safety. Building on these findings, we implement targeted safety enhancements for six distilled models. Evaluation results indicate that the enhanced models achieve significant improvements in safety while maintaining reasoning capabilities without notable degradation. We open-source the safety-enhanced models at https://github.com/UnicomAI/DeepSeek-R1-Distill-Safe/tree/main to serve as a valuable resource for future research and optimization of DeepSeek models. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.16529v1-abstract-full').style.display = 'none'; document.getElementById('2503.16529v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">21 pages,13 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.16185">arXiv:2503.16185</a> <span> [<a href="https://arxiv.org/pdf/2503.16185">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> MapGlue: Multimodal Remote Sensing Image Matching </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wu%2C+P">Peihao Wu</a>, <a href="/search/cs?searchtype=author&query=Yao%2C+Y">Yongxiang Yao</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+W">Wenfei Zhang</a>, <a href="/search/cs?searchtype=author&query=Wei%2C+D">Dong Wei</a>, <a href="/search/cs?searchtype=author&query=Wan%2C+Y">Yi Wan</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yansheng Li</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yongjun Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.16185v1-abstract-short" style="display: inline;"> Multimodal remote sensing image (MRSI) matching is pivotal for cross-modal fusion, localization, and object detection, but it faces severe challenges due to geometric, radiometric, and viewpoint discrepancies across imaging modalities. Existing unimodal datasets lack scale and diversity, limiting deep learning solutions. This paper proposes MapGlue, a universal MRSI matching framework, and MapData… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.16185v1-abstract-full').style.display = 'inline'; document.getElementById('2503.16185v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.16185v1-abstract-full" style="display: none;"> Multimodal remote sensing image (MRSI) matching is pivotal for cross-modal fusion, localization, and object detection, but it faces severe challenges due to geometric, radiometric, and viewpoint discrepancies across imaging modalities. Existing unimodal datasets lack scale and diversity, limiting deep learning solutions. This paper proposes MapGlue, a universal MRSI matching framework, and MapData, a large-scale multimodal dataset addressing these gaps. Our contributions are twofold. MapData, a globally diverse dataset spanning 233 sampling points, offers original images (7,000x5,000 to 20,000x15,000 pixels). After rigorous cleaning, it provides 121,781 aligned electronic map-visible image pairs (512x512 pixels) with hybrid manual-automated ground truth, addressing the scarcity of scalable multimodal benchmarks. MapGlue integrates semantic context with a dual graph-guided mechanism to extract cross-modal invariant features. This structure enables global-to-local interaction, enhancing descriptor robustness against modality-specific distortions. Extensive evaluations on MapData and five public datasets demonstrate MapGlue's superiority in matching accuracy under complex conditions, outperforming state-of-the-art methods. 
Notably, MapGlue generalizes effectively to unseen modalities without retraining, highlighting its adaptability. This work addresses longstanding challenges in MRSI matching by combining scalable dataset construction with a robust, semantics-driven framework. Furthermore, MapGlue shows strong generalization capabilities on other modality matching tasks for which it was not specifically trained. The dataset and code are available at https://github.com/PeihaoWu/MapGlue.
Submitted 20 March, 2025; originally announced March 2025.
Comments: The dataset and code are available at https://github.com/PeihaoWu/MapGlue

arXiv:2503.15578 (https://arxiv.org/abs/2503.15578) [pdf, other]
Subjects: cs.LG (Machine Learning)
Title: Sparseformer: a Transferable Transformer with Multi-granularity Token Sparsification for Medical Time Series Classification
Authors: Jiexia Ye, Weiqi Zhang, Ziyue Li, Jia Li, Fugee Tsung
Abstract: Medical time series (MedTS) classification is crucial for improved diagnosis in healthcare, and yet it is challenging due to the varying granularity of patterns, intricate inter-channel correlation, information redundancy, and label scarcity.
While existing transformer-based models have shown promise in time series analysis, they mainly focus on forecasting and fail to fully exploit the distinctive characteristics of MedTS data. In this paper, we introduce Sparseformer, a transformer specifically designed for MedTS classification. We propose a sparse token-based dual-attention mechanism that enables global modeling and token compression, allowing dynamic focus on the most informative tokens while distilling redundant features. This mechanism is then applied to the multi-granularity, cross-channel encoding of medical signals, capturing intra- and inter-granularity correlations and inter-channel connections. The sparsification design allows our model to handle heterogeneous inputs of varying lengths and channels directly. Further, we introduce an adaptive label encoder to address label space misalignment across datasets, equipping our model with cross-dataset transferability to alleviate the medical label scarcity issue. Our model outperforms 12 baselines across seven medical datasets under supervised learning. In the few-shot learning experiments, our model also achieves superior average results. In addition, the in-domain and cross-domain experiments among three diagnostic scenarios demonstrate our model's zero-shot learning capability. Collectively, these findings underscore the robustness and transferability of our model in various medical applications.
Submitted 19 March, 2025; originally announced March 2025.
Comments: 3 figures, 16 pages, 5 tables

arXiv:2503.15285 (https://arxiv.org/abs/2503.15285) [pdf, other]
Subjects: cs.CV (Computer Vision and Pattern Recognition)
Title: PAPI-Reg: Patch-to-Pixel Solution for Efficient Cross-Modal Registration between LiDAR Point Cloud and Camera Image
Authors: Yuanchao Yue, Zhengxin Li, Wei Zhang, Hui Yuan
Abstract: The primary requirement for cross-modal data fusion is the precise alignment of data from different sensors. However, the calibration between LiDAR point clouds and camera images is typically time-consuming and needs an external calibration board or specific environmental features. Cross-modal registration effectively solves this problem by aligning the data directly without requiring external calibration. However, due to the domain gap between the point cloud and the image, existing methods rarely achieve satisfactory registration accuracy while maintaining real-time performance. To address this issue, we propose a framework that projects point clouds into several 2D representations for matching with camera images, which not only leverages the geometric characteristics of LiDAR point clouds more effectively but also bridges the domain gap between the point cloud and the image. Moreover, to tackle the challenges of cross-modal differences and the limited overlap between LiDAR point clouds and images in the image matching task, we introduce a multi-scale feature extraction network to effectively extract features from both camera images and the projection maps of the LiDAR point cloud. Additionally, we propose a patch-to-pixel matching network to provide more effective supervision and achieve higher accuracy. We validate the performance of our model through experiments on the KITTI and nuScenes datasets. Our network achieves real-time performance and extremely high registration accuracy. On the KITTI dataset, our model achieves a registration accuracy rate of over 99%.
Submitted 19 March, 2025; originally announced March 2025.
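The framework above hinges on projecting LiDAR point clouds into 2D representations that can be matched against camera images. As a rough illustration of that general idea only (a standard spherical projection, not the pipeline proposed in the paper), a range image can be built from an (N, 3) point array as in the minimal sketch below; the resolution and field-of-view values are assumptions typical of a 64-beam sensor.

import numpy as np

def range_image(points, h=64, w=1024, fov_up_deg=3.0, fov_down_deg=-25.0):
    """Project an (N, 3) LiDAR point cloud to an (h, w) range image.

    Illustrative only: a plain spherical projection, not the method of
    arXiv:2503.15285. FOV and resolution values are assumed defaults.
    """
    x, y, z = points[:, 0], points[:, 1], points[:, 2]
    r = np.linalg.norm(points, axis=1) + 1e-8            # range of each point
    yaw = np.arctan2(y, x)                               # azimuth in [-pi, pi]
    pitch = np.arcsin(z / r)                             # elevation angle

    fov_up = np.deg2rad(fov_up_deg)
    fov_down = np.deg2rad(fov_down_deg)
    fov = fov_up - fov_down

    u = 0.5 * (1.0 - yaw / np.pi) * w                    # column index
    v = (1.0 - (pitch - fov_down) / fov) * h             # row index
    u = np.clip(np.floor(u), 0, w - 1).astype(np.int32)
    v = np.clip(np.floor(v), 0, h - 1).astype(np.int32)

    img = np.zeros((h, w), dtype=np.float32)
    order = np.argsort(-r)                               # write far points first
    img[v[order], u[order]] = r[order]                   # nearer points overwrite farther ones
    return img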
arXiv:2503.15092 (https://arxiv.org/abs/2503.15092) [pdf, other]
Subjects: cs.CR (Cryptography and Security); cs.AI (Artificial Intelligence); cs.CL (Computation and Language)
Title: Towards Understanding the Safety Boundaries of DeepSeek Models: Evaluation and Findings
Authors: Zonghao Ying, Guangyi Zheng, Yongxin Huang, Deyue Zhang, Wenxin Zhang, Quanchen Zou, Aishan Liu, Xianglong Liu, Dacheng Tao
Abstract: This study presents the first comprehensive safety evaluation of the DeepSeek models, focusing on evaluating the safety risks associated with their generated content. Our evaluation encompasses DeepSeek's latest generation of large language models, multimodal large language models, and text-to-image models, systematically examining their performance regarding unsafe content generation. Notably, we developed a bilingual (Chinese-English) safety evaluation dataset tailored to Chinese sociocultural contexts, enabling a more thorough evaluation of the safety capabilities of Chinese-developed models. Experimental results indicate that despite their strong general capabilities, DeepSeek models exhibit significant safety vulnerabilities across multiple risk dimensions, including algorithmic discrimination and sexual content. These findings provide crucial insights for understanding and improving the safety of large foundation models. Our code is available at https://github.com/NY1024/DeepSeek-Safety-Eval.
Submitted 19 March, 2025; originally announced March 2025.
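Safety evaluations of this kind generally reduce to running a categorized prompt set through a model and scoring each response per risk dimension. The sketch below shows only that generic pattern under stated assumptions: generate and is_unsafe are hypothetical stand-ins, and the code is not taken from the repository linked above.

from collections import Counter

def evaluate_safety(prompts, generate, is_unsafe):
    """Generic safety-evaluation loop (illustrative sketch, not the paper's code).

    prompts:   iterable of (risk_category, prompt_text) pairs
    generate:  callable mapping a prompt string to a model response
    is_unsafe: callable mapping (prompt, response) to True if the response is unsafe
    """
    unsafe = Counter()
    total = Counter()
    for category, prompt in prompts:
        response = generate(prompt)
        total[category] += 1
        if is_unsafe(prompt, response):
            unsafe[category] += 1
    # per-category unsafe-response rate
    return {c: unsafe[c] / total[c] for c in total}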
arXiv:2503.15082 (https://arxiv.org/abs/2503.15082) [pdf, other]
Subjects: cs.RO (Robotics); cs.AI (Artificial Intelligence)
Title: StyleLoco: Generative Adversarial Distillation for Natural Humanoid Robot Locomotion
Authors: Le Ma, Ziyu Meng, Tengyu Liu, Yuhan Li, Ran Song, Wei Zhang, Siyuan Huang
Abstract: Humanoid robots are anticipated to acquire a wide range of locomotion capabilities while ensuring natural movement across varying speeds and terrains. Existing methods encounter a fundamental dilemma in learning humanoid locomotion: reinforcement learning with handcrafted rewards can achieve agile locomotion but produces unnatural gaits, while Generative Adversarial Imitation Learning (GAIL) with motion capture data yields natural movements but suffers from unstable training processes and restricted agility. Integrating these approaches proves challenging due to the inherent heterogeneity between expert policies and human motion datasets. To address this, we introduce StyleLoco, a novel two-stage framework that bridges this gap through a Generative Adversarial Distillation (GAD) process.
Our framework begins by training a teacher policy using reinforcement learning to achieve agile and dynamic locomotion. It then employs a multi-discriminator architecture, where distinct discriminators concurrently extract skills from both the teacher policy and motion capture data. This approach effectively combines the agility of reinforcement learning with the natural fluidity of human-like movements while mitigating the instability issues commonly associated with adversarial training. Through extensive simulation and real-world experiments, we demonstrate that StyleLoco enables humanoid robots to perform diverse locomotion tasks with the precision of expertly trained policies and the natural aesthetics of human motion, successfully transferring styles across different movement types while maintaining stable locomotion across a broad spectrum of command inputs.
Submitted 19 March, 2025; originally announced March 2025.
Comments: 9 pages, 4 figures

arXiv:2503.14929 (https://arxiv.org/abs/2503.14929) [pdf, other]
Subjects: cs.DB (Databases); cs.LG (Machine Learning)
Title: ACE: A Cardinality Estimator for Set-Valued Queries
Authors: Yufan Sheng, Xin Cao, Kaiqi Zhao, Yixiang Fang, Jianzhong Qi, Wenjie Zhang, Christian S. Jensen
Abstract: Cardinality estimation is a fundamental functionality in database systems. Most existing cardinality estimators focus on handling predicates over numeric or categorical data. They have largely omitted an important data type, set-valued data, which frequently occur in contemporary applications such as information retrieval and recommender systems. The few existing estimators for such data either favor high-frequency elements or rely on a partial independence assumption, which limits their practical applicability. We propose ACE, an Attention-based Cardinality Estimator for estimating the cardinality of queries over set-valued data. We first design a distillation-based data encoder to condense the dataset into a compact matrix. We then design an attention-based query analyzer to capture correlations among query elements. To handle variable-sized queries, a pooling module is introduced, followed by a regression model (MLP) to generate final cardinality estimates. We evaluate ACE on three datasets with varying query element distributions, demonstrating that ACE outperforms the state-of-the-art competitors in terms of both accuracy and efficiency.
Submitted 19 March, 2025; originally announced March 2025.
Comments: This paper has been accepted by PVLDB Vol 18

arXiv:2503.14640 (https://arxiv.org/abs/2503.14640) [pdf, other]
Subjects: cs.CV (Computer Vision and Pattern Recognition); cs.AI (Artificial Intelligence)
Title: Dynamic Accumulated Attention Map for Interpreting Evolution of Decision-Making in Vision Transformer
Authors: Yi Liao, Yongsheng Gao, Weichuan Zhang
Abstract: Various Vision Transformer (ViT) models have been widely used for image recognition tasks. However, existing visual explanation methods cannot display the attention flow hidden inside the inner structure of ViT models, which explains how the final attention regions are formed inside a ViT for its decision-making. In this paper, a novel visual explanation approach, Dynamic Accumulated Attention Map (DAAM), is proposed to provide a tool that can visualize, for the first time, the attention flow from the top to the bottom through ViT networks. To this end, a novel decomposition module is proposed to construct and store the spatial feature information by unlocking the [class] token generated by the self-attention module of each ViT block. The module can also obtain the channel importance coefficients by decomposing the classification score for supervised ViT models. Because of the lack of a classification score in self-supervised ViT models, we propose dimension-wise importance weights to compute the channel importance coefficients. Such spatial features are linearly combined with the corresponding channel importance coefficients, forming the attention map for each block. The dynamic attention flow is revealed by block-wisely accumulating each attention map. The contribution of this work focuses on visualizing the evolution dynamics of the decision-making attention for any intermediate block inside a ViT model by proposing a novel decomposition module and dimension-wise importance weights. Quantitative and qualitative analyses consistently validate the effectiveness and superior capacity of the proposed DAAM for interpreting not only ViT models with fully-connected layers as the classifier but also self-supervised ViT models. The code is available at https://github.com/ly9802/DynamicAccumulatedAttentionMap.
Submitted 18 March, 2025; originally announced March 2025.
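DAAM visualizes how decision-making attention evolves by accumulating an attention map block by block. The much-simplified sketch below conveys only the accumulation idea, using plain [class]-token attention as a stand-in; it omits the paper's decomposition module and channel importance coefficients and is not the released implementation.

import torch

@torch.no_grad()
def accumulated_cls_attention(attn_per_block):
    """Accumulate [class]-token attention maps block by block (simplified, DAAM-style view).

    attn_per_block: list of attention tensors, one per ViT block,
                    each of shape (heads, tokens, tokens) with the [class] token at index 0.
    Returns a list of accumulated maps, one per block, each of shape (tokens - 1,).
    """
    accumulated = None
    maps = []
    for attn in attn_per_block:
        cls_to_patches = attn.mean(dim=0)[0, 1:]          # average heads, take CLS->patch row
        cls_to_patches = cls_to_patches / cls_to_patches.sum()
        accumulated = cls_to_patches if accumulated is None else accumulated + cls_to_patches
        maps.append(accumulated.clone())                  # snapshot after this block
    return maps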
arXiv:2503.14476 (https://arxiv.org/abs/2503.14476) [pdf, other]
Subjects: cs.LG (Machine Learning); cs.CL (Computation and Language)
Title: DAPO: An Open-Source LLM Reinforcement Learning System at Scale
Authors: Qiying Yu, Zheng Zhang, Ruofei Zhu, Yufeng Yuan, Xiaochen Zuo, Yu Yue, Tiantian Fan, Gaohong Liu, Lingjun Liu, Xin Liu, Haibin Lin, Zhiqi Lin, Bole Ma, Guangming Sheng, Yuxuan Tong, Chi Zhang, Mofan Zhang, Wang Zhang, Hang Zhu, Jinhua Zhu, Jiaze Chen, Jiangjie Chen, Chengyi Wang, Hongli Yu, Weinan Dai, et al. (10 additional authors not shown)
Abstract: Inference scaling empowers LLMs with unprecedented reasoning ability, with reinforcement learning as the core technique to elicit complex reasoning. However, key technical details of state-of-the-art reasoning LLMs are concealed (such as in the OpenAI o1 blog and the DeepSeek R1 technical report), so the community still struggles to reproduce their RL training results. We propose the Decoupled Clip and Dynamic sAmpling Policy Optimization (DAPO) algorithm, and fully open-source a state-of-the-art large-scale RL system that achieves 50 points on AIME 2024 using the Qwen2.5-32B base model. Unlike previous works that withhold training details, we introduce four key techniques of our algorithm that make large-scale LLM RL a success. In addition, we open-source our training code, which is built on the verl framework, along with a carefully curated and processed dataset. These components of our open-source system enhance reproducibility and support future research in large-scale LLM RL.
Submitted 18 March, 2025; originally announced March 2025.
Comments: Project Page: https://dapo-sia.github.io/
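The abstract names the algorithm but leaves its four key techniques to the paper and the open-source code on the project page. Purely as a reading aid, the sketch below shows one plausible shape for a clipped policy-gradient surrogate with decoupled lower/upper clip ranges and a simple dynamic-sampling filter; the epsilon values and the filtering rule are assumptions, not the released implementation.

import torch

def decoupled_clip_loss(logp_new, logp_old, advantages, eps_low=0.2, eps_high=0.28):
    """PPO-style clipped surrogate with separate lower/upper clip ranges (illustrative).

    logp_new, logp_old: per-token log-probs under the current and behavior policies.
    advantages:         per-token advantage estimates.
    """
    ratio = torch.exp(logp_new - logp_old)
    clipped = torch.clamp(ratio, 1.0 - eps_low, 1.0 + eps_high)
    return -torch.min(ratio * advantages, clipped * advantages).mean()

def keep_informative_prompts(groups):
    """Dynamic-sampling filter (illustrative): drop prompt groups whose sampled
    responses all received the same reward, since they carry no learning signal.
    Each group g is a list of (response, reward) pairs for one prompt."""
    return [g for g in groups if len({reward for _, reward in g}) > 1]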
arXiv:2503.14195 (https://arxiv.org/abs/2503.14195) [pdf, other]
Subjects: cs.DB (Databases); cs.CV (Computer Vision and Pattern Recognition)
DOI: 10.1016/j.compenvurbsys.2025.102282 (https://doi.org/10.1016/j.compenvurbsys.2025.102282)
Title: Mapping Urban Villages in China: Progress and Challenges
Authors: Rui Cao, Wei Tu, Dongsheng Chen, Wenyu Zhang
Abstract: The shift toward high-quality urbanization has brought increased attention to the issue of "urban villages", which has become a prominent social problem in China. However, there is a lack of available geospatial data on urban villages, making it crucial to prioritize urban village mapping. In order to assess the current progress in urban village mapping and identify challenges and future directions, we have conducted a comprehensive review, which to the best of our knowledge is the first of its kind in this field. Our review begins by providing a clear context for urban villages and elaborating the method for literature review, then summarizes the study areas, data sources, and approaches used for urban village mapping in China. We also address the challenges and future directions for further research.
Through thorough investigation, we find that current studies only cover very limited study areas and periods and lack sufficient investigation into the scalability, transferability, and interpretability of identification approaches, owing to the challenges of concept fuzziness and variance, the spatial heterogeneity of urban villages, and data availability. Future research can complement and further the current research in the following potential directions in order to achieve large-area mapping across the whole nation...
Submitted 18 March, 2025; originally announced March 2025.
Comments: Updated review at https://github.com/rui-research/urban-village-review
Journal ref: Computers, Environment and Urban Systems, 119, 102282 (2025)

arXiv:2503.14074 (https://arxiv.org/abs/2503.14074) [pdf, other]
Subjects: cs.CV (Computer Vision and Pattern Recognition)
DOI: 10.1109/TMM.2023.3286278 (https://doi.org/10.1109/TMM.2023.3286278)
Title: Limb-Aware Virtual Try-On Network with Progressive Clothing Warping
Authors: Shengping Zhang, Xiaoyu Han, Weigang Zhang, Xiangyuan Lan, Hongxun Yao, Qingming Huang
Abstract: Image-based virtual try-on aims to transfer an in-shop clothing image to a person image. Most existing methods adopt a single global deformation to perform clothing warping directly, which lacks fine-grained modeling of in-shop clothing and leads to distorted clothing appearance. In addition, existing methods usually fail to generate limb details well because they are limited by the clothing-agnostic person representation used, without referring to the limb textures of the person image. To address these problems, we propose a Limb-aware Virtual Try-on Network named PL-VTON, which performs fine-grained clothing warping progressively and generates high-quality try-on results with realistic limb details. Specifically, we present Progressive Clothing Warping (PCW) that explicitly models the location and size of in-shop clothing and utilizes a two-stage alignment strategy to progressively align the in-shop clothing with the human body. Moreover, a novel gravity-aware loss that considers the fit of the person wearing clothing is adopted to better handle the clothing edges. Then, we design a Person Parsing Estimator (PPE) with a non-limb target parsing map to semantically divide the person into various regions, which provides structural constraints on the human body and therefore alleviates texture bleeding between clothing and body regions. Finally, we introduce Limb-aware Texture Fusion (LTF) that focuses on generating realistic details in limb regions, where a coarse try-on result is first generated by fusing the warped clothing image with the person image, then limb textures are further fused with the coarse result under limb-aware guidance to refine limb details. Extensive experiments demonstrate that our PL-VTON outperforms the state-of-the-art methods both qualitatively and quantitatively.
Submitted 18 March, 2025; originally announced March 2025.
Comments: Accepted by IEEE Transactions on Multimedia (TMM). The code is available at https://github.com/aipixel/PL-VTONv2
Journal ref: IEEE Transactions on Multimedia, vol. 26, pp. 1731-1746, 2024
arXiv:2503.13806 (https://arxiv.org/abs/2503.13806) [pdf]
Subjects: cs.CV (Computer Vision and Pattern Recognition); cs.AI (Artificial Intelligence)
Title: Organ-aware Multi-scale Medical Image Segmentation Using Text Prompt Engineering
Authors: Wenjie Zhang, Ziyang Zhang, Mengnan He, Jiancheng Ye
Abstract: Accurate segmentation is essential for effective treatment planning and disease monitoring. Existing medical image segmentation methods predominantly rely on uni-modal visual inputs, such as images or videos, requiring labor-intensive manual annotations. Additionally, medical imaging techniques capture multiple intertwined organs within a single scan, further complicating segmentation accuracy. To address these challenges, MedSAM, a large-scale medical segmentation model based on the Segment Anything Model (SAM), was developed to enhance segmentation accuracy by integrating image features with user-provided prompts. While MedSAM has demonstrated strong performance across various medical segmentation tasks, it primarily relies on geometric prompts (e.g., points and bounding boxes) and lacks support for text-based prompts, which could help specify subtle or ambiguous anatomical structures. To overcome these limitations, we propose the Organ-aware Multi-scale Text-guided Medical Image Segmentation Model (OMT-SAM) for multi-organ segmentation. Our approach introduces CLIP encoders as a novel image-text prompt encoder, operating with the geometric prompt encoder to provide informative contextual guidance. We pair descriptive textual prompts with corresponding images, processing them through pre-trained CLIP encoders and a cross-attention mechanism to generate fused image-text embeddings. Additionally, we extract multi-scale visual features from MedSAM, capturing fine-grained anatomical details at different levels of granularity.
We evaluate OMT-SAM on the FLARE 2021 dataset, benchmarking its performance against existing segmentation methods. Empirical results demonstrate that OMT-SAM achieves a mean Dice Similarity Coefficient of 0.937, outperforming MedSAM (0.893) and other segmentation models, highlighting its superior capability in handling complex medical image segmentation tasks.
Submitted 17 March, 2025; originally announced March 2025.

arXiv:2503.13563 (https://arxiv.org/abs/2503.13563) [pdf, other]
Subjects: cs.CL (Computation and Language); cs.AI (Artificial Intelligence); cs.IR (Information Retrieval)
Title: MES-RAG: Bringing Multi-modal, Entity-Storage, and Secure Enhancements to RAG
Authors: Pingyu Wu, Daiheng Gao, Jing Tang, Huimin Chen, Wenbo Zhou, Weiming Zhang, Nenghai Yu
Abstract: Retrieval-Augmented Generation (RAG) improves Large Language Models (LLMs) by using external knowledge, but it struggles with precise entity information retrieval. In this paper, we propose the MES-RAG framework, which enhances entity-specific query handling and provides accurate, secure, and consistent responses.
MES-RAG introduces proactive security measures that ensure system integrity by applying protections prior to data access. Additionally, the system supports real-time multi-modal outputs, including text, images, audio, and video, seamlessly integrating into existing RAG architectures. Experimental results demonstrate that MES-RAG significantly improves both accuracy and recall, highlighting its effectiveness in advancing the security and utility of question-answering, increasing accuracy to 0.83 (+0.25) on the targeted task. Our code and data are available at https://github.com/wpydcr/MES-RAG.
Submitted 17 March, 2025; originally announced March 2025.
Comments: NAACL 2025

arXiv:2503.13548 (https://arxiv.org/abs/2503.13548) [pdf]
Subjects: cs.LG (Machine Learning); cs.AI (Artificial Intelligence)
Title: Fuzzy Rule-based Differentiable Representation Learning
Authors: Wei Zhang, Zhaohong Deng, Guanjin Wang, Kup-Sze Choi
Abstract: Representation learning has emerged as a crucial focus in machine and deep learning, involving the extraction of meaningful and useful features and patterns from the input data, thereby enhancing the performance of various downstream tasks such as classification, clustering, and prediction.
Current mainstream representation learning methods primarily rely on non-linear data mining techniques such as kernel methods and deep neural networks to extract abstract knowledge from complex datasets. However, most of these methods are black-box, lacking transparency and interpretability in the learning process, which constrains their practical utility. To this end, this paper introduces a novel representation learning method grounded in an interpretable fuzzy rule-based model. Specifically, it is built upon the Takagi-Sugeno-Kang fuzzy system (TSK-FS) to initially map input data to a high-dimensional fuzzy feature space through the antecedent part of the TSK-FS. Subsequently, a novel differentiable optimization method is proposed for the consequent part learning, which can preserve the model's interpretability and transparency while further exploring the nonlinear relationships within the data. This optimization method retains the essence of traditional optimization, with certain parts of the process parameterized, corresponding differentiable modules constructed, and a deep optimization process implemented. Consequently, this method not only enhances the model's performance but also ensures its interpretability. Moreover, a second-order geometry preservation method is introduced to further improve the robustness of the proposed method. Extensive experiments conducted on various benchmark datasets validate the superiority of the proposed method, highlighting its potential for advancing representation learning methodologies.
Submitted 16 March, 2025; originally announced March 2025.
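The method above builds on the Takagi-Sugeno-Kang fuzzy system, whose antecedent part maps inputs to normalized rule firing strengths before the consequent part is learned. The sketch below shows only that standard antecedent construction with Gaussian membership functions; the paper's differentiable consequent learning and second-order geometry preservation are not reproduced here, and the parameter choices are assumptions.

import numpy as np

def tsk_antecedent(X, centers, sigmas):
    """Map inputs to normalized TSK rule firing strengths (standard construction, illustrative).

    X:       (n_samples, n_features) input data
    centers: (n_rules, n_features) Gaussian membership centers (e.g., from k-means)
    sigmas:  (n_rules, n_features) membership widths
    Returns (n_samples, n_rules) normalized firing strengths.
    """
    diff = X[:, None, :] - centers[None, :, :]                      # (n, r, d) differences
    memberships = np.exp(-0.5 * (diff / sigmas[None, :, :]) ** 2)   # per-feature Gaussian memberships
    firing = memberships.prod(axis=2)                               # product T-norm over features
    firing = firing / (firing.sum(axis=1, keepdims=True) + 1e-12)   # normalize across rules
    return firing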
arXiv:2503.13435 (https://arxiv.org/abs/2503.13435) [pdf, other]
Subjects: cs.CV (Computer Vision and Pattern Recognition)
Title: WideRange4D: Enabling High-Quality 4D Reconstruction with Wide-Range Movements and Scenes
Authors: Ling Yang, Kaixin Zhu, Juanxi Tian, Bohan Zeng, Mingbao Lin, Hongjuan Pei, Wentao Zhang, Shuicheng Yan
Abstract: With the rapid development of 3D reconstruction technology, research in 4D reconstruction is also advancing, and existing 4D reconstruction methods can generate high-quality 4D scenes. However, due to the challenges in acquiring multi-view video data, the current 4D reconstruction benchmarks mainly display actions performed in place, such as dancing, within limited scenarios. In practical scenarios, many scenes involve wide-range spatial movements, highlighting the limitations of existing 4D reconstruction datasets. Additionally, existing 4D reconstruction methods rely on deformation fields to estimate the dynamics of 3D objects, but deformation fields struggle with wide-range spatial movements, which limits the ability to achieve high-quality 4D scene reconstruction with wide-range spatial movements. In this paper, we focus on 4D scene reconstruction with significant object spatial movements and propose a novel 4D reconstruction benchmark, WideRange4D. This benchmark includes rich 4D scene data with large spatial variations, allowing for a more comprehensive evaluation of the generation capabilities of 4D generation methods. Furthermore, we introduce a new 4D reconstruction method, Progress4D, which generates stable and high-quality 4D results across various complex 4D scene reconstruction tasks.
We conduct both quantitative and qualitative comparison experiments on WideRange4D, showing that our Progress4D outperforms existing state-of-the-art 4D reconstruction methods. Project: https://github.com/Gen-Verse/WideRange4D
Submitted 17 March, 2025; originally announced March 2025.
Comments: Project: https://github.com/Gen-Verse/WideRange4D

arXiv:2503.13109 [pdf, other] cs.CL
Code-Driven Inductive Synthesis: Enhancing Reasoning Abilities of Large Language Models with Sequences
Authors: Kedi Chen, Zhikai Lei, Fan Zhang, Yinqi Zhang, Qin Chen, Jie Zhou, Liang He, Qipeng Guo, Kai Chen, Wei Zhang
Abstract: Large language models make remarkable progress in reasoning capabilities. Existing works focus mainly on deductive reasoning tasks (e.g., code and math), while another type of reasoning mode that better aligns with human learning, inductive reasoning, is not well studied.
We attribute this to the fact that obtaining high-quality process supervision data is challenging for inductive reasoning. To this end, we employ number sequences as the source of inductive reasoning data. We package sequences into algorithmic problems to find the general term of each sequence through a code solution. In this way, we can verify whether the code solution holds for any term in the current sequence, and inject case-based supervision signals by using code unit tests. We build a sequence synthetic data pipeline and form a training dataset, CodeSeq. Experimental results show that models tuned with CodeSeq improve on both code and comprehensive reasoning benchmarks.
Submitted 17 March, 2025; originally announced March 2025.
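The case-based verification step in this abstract, checking a candidate general-term program against the observed terms of a sequence, lends itself to a short illustration. The sketch below is hypothetical and not the authors' pipeline: `candidate_term` and the example sequence are placeholders standing in for an LLM-generated code solution and a sampled sequence.

```python
# Hypothetical sketch of case-based supervision via unit tests (not the CodeSeq pipeline).
# A candidate "general term" program is accepted only if it reproduces every observed term.

def candidate_term(n: int) -> int:
    """Placeholder for an LLM-generated solution: here, a(n) = n^2 + 1 (1-indexed)."""
    return n * n + 1

def unit_test_sequence(term_fn, observed):
    """Per-term pass/fail results; these per-case signals would serve as supervision."""
    return [term_fn(i + 1) == value for i, value in enumerate(observed)]

observed = [2, 5, 10, 17, 26]             # the sequence the problem was built from
results = unit_test_sequence(candidate_term, observed)
print(results)                             # [True, True, True, True, True]
print("accepted" if all(results) else "rejected")
```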
arXiv:2503.12999 [pdf, other] cs.CV cs.AI
Concept-as-Tree: Synthetic Data is All You Need for VLM Personalization
Authors: Ruichuan An, Kai Zeng, Ming Lu, Sihan Yang, Renrui Zhang, Huitong Ji, Qizhe Zhang, Yulin Luo, Hao Liang, Wentao Zhang
Abstract: Vision-Language Models (VLMs) have demonstrated exceptional performance in various multi-modal tasks. Recently, there has been an increasing interest in improving the personalization capabilities of VLMs. To better integrate user-provided concepts into VLMs, many methods use positive and negative samples to fine-tune these models. However, the scarcity of user-provided positive samples and the low quality of retrieved negative samples pose challenges for fine-tuning. To reveal the relationship between samples and model performance, we systematically investigate the impact of positive and negative samples (easy and hard) and their diversity on VLM personalization tasks. Based on this analysis, we introduce Concept-as-Tree (CaT), which represents a concept as a tree structure, thereby enabling the generation of positive and negative samples with varying difficulty and diversity for VLM personalization. With a well-designed data filtering strategy, our CaT framework can ensure the quality of generated data, constituting a powerful pipeline. We perform thorough experiments with various VLM personalization baselines to assess the effectiveness of the pipeline, alleviating the lack of positive samples and the low quality of negative samples. Our results demonstrate that CaT equipped with the proposed data filter significantly enhances the personalization capabilities of VLMs across the MyVLM, Yo'LLaVA, and MC-LLaVA datasets. To our knowledge, this work is the first controllable synthetic data pipeline for VLM personalization. The code is released at https://github.com/zengkaiya/CaT.
Submitted 23 March, 2025; v1 submitted 17 March, 2025; originally announced March 2025.
Comments: The code is released at https://github.com/zengkaiya/CaT

arXiv:2503.12764 [pdf, other] cs.CV
Decouple to Reconstruct: High Quality UHD Restoration via Active Feature Disentanglement and Reversible Fusion
Authors: Yidi Liu, Dong Li, Yuxin Ma, Jie Huang, Wenlong Zhang, Xueyang Fu, Zheng-jun Zha
Abstract: Ultra-high-definition (UHD) image restoration often faces computational bottlenecks and information loss due to its extremely high resolution. Existing studies based on Variational Autoencoders (VAE) improve efficiency by transferring the image restoration process from pixel space to latent space. However, degraded components are inherently coupled with background elements in degraded images, so both information loss during compression and information gain during compensation remain uncontrollable. This leads to restored images that often exhibit detail loss and incomplete degradation removal. To address this issue, we propose a Controlled Differential Disentangled VAE, which uses Hierarchical Contrastive Disentanglement Learning and an Orthogonal Gated Projection Module to guide the VAE to actively discard easily recoverable background information while encoding more difficult-to-recover degraded information into the latent space.
Additionally, we design a Complex Invertible Multiscale Fusion Network to handle background features, ensuring their consistency, and utilize a latent space restoration network to transform the degraded latent features, leading to more accurate restoration results. Extensive experimental results demonstrate that our method effectively alleviates the information loss problem in VAE models while ensuring computational efficiency, significantly improving the quality of UHD image restoration, and achieves state-of-the-art results in six UHD restoration tasks with only 1M parameters.
Submitted 16 March, 2025; originally announced March 2025.

arXiv:2503.12497 [pdf, other] cs.CR cs.AI
Defense Against Model Stealing Based on Account-Aware Distribution Discrepancy
Authors: Jian-Ping Mei, Weibin Zhang, Jie Chen, Xuyun Zhang, Tiantian Zhu
Abstract: Malicious users attempt to replicate commercial models functionally at low cost by training a clone model with query responses. It is challenging to prevent such model-stealing attacks in a timely manner while achieving strong protection and maintaining utility.
In this paper, we propose a novel non-parametric detector called Account-aware Distribution Discrepancy (ADD) that recognizes queries from malicious users by leveraging account-wise local dependency. We formulate each class as a Multivariate Normal distribution (MVN) in the feature space and measure the malicious score as the sum of weighted class-wise distribution discrepancies. The ADD detector is combined with random-based prediction poisoning to yield a plug-and-play defense module named D-ADD for image classification models. Results of extensive experimental studies show that D-ADD achieves strong defense against different types of attacks with little interference in serving benign users, in both soft- and hard-label settings.
Submitted 16 March, 2025; originally announced March 2025.
Comments: 11 pages, 7 figures, published in AAAI 2025
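The per-class MVN formulation in this abstract can be illustrated with a short numerical sketch. The code below is an assumption-laden illustration, not the authors' D-ADD implementation: it fits one Gaussian per class on benign features and scores an account's queries by a weighted sum of class-wise log-likelihood gaps, where the specific discrepancy measure, weights, and regularization are placeholders.

```python
# Illustrative sketch only (not the D-ADD code): one MVN per class, and an
# account-level "malicious score" built from class-wise distribution discrepancy.
# Assumed choices: regularized covariances; discrepancy taken as the gap between the
# per-class mean log-likelihood of the account's queries and of benign data.
import numpy as np
from scipy.stats import multivariate_normal

def fit_class_mvns(features, labels, reg=1e-3):
    """Fit a multivariate normal to the benign feature vectors of each class."""
    mvns = {}
    for c in np.unique(labels):
        Xc = features[labels == c]
        cov = np.cov(Xc, rowvar=False) + reg * np.eye(Xc.shape[1])
        mvns[c] = (multivariate_normal(Xc.mean(axis=0), cov), Xc)
    return mvns

def account_malicious_score(query_feats, mvns, class_weights=None):
    """Weighted sum of class-wise discrepancies between an account's queries and benign data."""
    score = 0.0
    for c, (mvn, Xc) in mvns.items():
        w = 1.0 if class_weights is None else class_weights[c]
        benign_ll = mvn.logpdf(Xc).mean()
        query_ll = mvn.logpdf(query_feats).mean()
        score += w * abs(benign_ll - query_ll)   # large gap -> distribution mismatch
    return score

rng = np.random.default_rng(0)
benign = rng.normal(size=(200, 8)); labels = rng.integers(0, 4, size=200)
mvns = fit_class_mvns(benign, labels)
print(account_malicious_score(rng.normal(size=(30, 8)), mvns))          # in-distribution queries
print(account_malicious_score(rng.uniform(-4, 4, size=(30, 8)), mvns))  # atypical queries score higher
```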
arXiv:2503.12348 [pdf, other] cs.CV
ProbDiffFlow: An Efficient Learning-Free Framework for Probabilistic Single-Image Optical Flow Estimation
Authors: Mo Zhou, Jianwei Wang, Xuanmeng Zhang, Dylan Campbell, Kai Wang, Long Yuan, Wenjie Zhang, Xuemin Lin
Abstract: This paper studies optical flow estimation, a critical task in motion analysis with applications in autonomous navigation, action recognition, and film production. Traditional optical flow methods require consecutive frames, which are often unavailable due to limitations in data acquisition or real-world scene disruptions. Thus, single-frame optical flow estimation is emerging in the literature. However, existing single-frame approaches suffer from two major limitations: (1) they rely on labeled training data, making them task-specific, and (2) they produce deterministic predictions, failing to capture motion uncertainty. To overcome these challenges, we propose ProbDiffFlow, a training-free framework that estimates optical flow distributions from a single image. Instead of directly predicting motion, ProbDiffFlow follows an estimation-by-synthesis paradigm: it first generates diverse plausible future frames using a diffusion-based model, then estimates motion from these synthesized samples using a pre-trained optical flow model, and finally aggregates the results into a probabilistic flow distribution. This design eliminates the need for task-specific training while capturing multiple plausible motions. Experiments on both synthetic and real-world datasets demonstrate that ProbDiffFlow achieves superior accuracy, diversity, and efficiency, outperforming existing single-image and two-frame baselines.
Submitted 16 March, 2025; originally announced March 2025.
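The estimation-by-synthesis pipeline in this abstract has three clearly named stages, so a structural sketch may help. The helpers `sample_future_frame` and `estimate_flow` below are hypothetical stand-ins for a diffusion image-to-image model and a pre-trained two-frame flow network, and the aggregation into a per-pixel mean and variance is an assumed summary rather than the paper's choice.

```python
# Structural sketch of an estimation-by-synthesis flow pipeline (assumptions, not the paper's code).
import numpy as np

def sample_future_frame(image: np.ndarray, rng: np.random.Generator) -> np.ndarray:
    """Placeholder: a diffusion model would synthesize a plausible next frame here."""
    return image + rng.normal(scale=5.0, size=image.shape)

def estimate_flow(frame0: np.ndarray, frame1: np.ndarray) -> np.ndarray:
    """Placeholder: a pre-trained flow network would return an (H, W, 2) flow field here."""
    h, w = frame0.shape[:2]
    return np.zeros((h, w, 2)) + (frame1.mean() - frame0.mean())

def prob_flow_from_single_image(image: np.ndarray, n_samples: int = 8, seed: int = 0):
    """Synthesize n_samples future frames, estimate flow for each, aggregate to mean/variance."""
    rng = np.random.default_rng(seed)
    flows = np.stack([estimate_flow(image, sample_future_frame(image, rng))
                      for _ in range(n_samples)])
    return flows.mean(axis=0), flows.var(axis=0)   # per-pixel summary of the flow distribution

mean_flow, var_flow = prob_flow_from_single_image(np.zeros((64, 64, 3)))
print(mean_flow.shape, var_flow.shape)  # (64, 64, 2) (64, 64, 2)
```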
arXiv:2503.12094 [pdf, other] cs.CV
E-SAM: Training-Free Segment Every Entity Model
Authors: Weiming Zhang, Dingwen Xiao, Lei Chen, Lin Wang
Abstract: Entity Segmentation (ES) aims at identifying and segmenting distinct entities within an image without the need for predefined class labels. This characteristic makes ES well-suited to open-world applications that must adapt to diverse and dynamically changing environments, where new and previously unseen entities may appear frequently. Existing ES methods either require large annotated datasets or incur high training costs, limiting their scalability and adaptability. Recently, the Segment Anything Model (SAM), especially in its Automatic Mask Generation (AMG) mode, has shown potential for holistic image segmentation. However, it struggles with over-segmentation and under-segmentation, making it less effective for ES. In this paper, we introduce E-SAM, a novel training-free framework that exhibits exceptional ES capability. Specifically, we first propose Multi-level Mask Generation (MMG), which hierarchically processes SAM's AMG outputs to generate reliable object-level masks while preserving fine details at other levels. Entity-level Mask Refinement (EMR) then refines these object-level masks into accurate entity-level masks: it separates overlapping masks to address the redundancy inherent in SAM's outputs and merges similar masks by evaluating entity-level consistency. Lastly, Under-Segmentation Refinement (USR) addresses under-segmentation by generating additional high-confidence masks that are fused with the EMR outputs to produce the final ES map. These three modules are seamlessly optimized to achieve the best ES without additional training overhead. Extensive experiments demonstrate that E-SAM achieves state-of-the-art performance compared to prior ES methods, with a significant improvement of +30.1 on benchmark metrics.
Submitted 15 March, 2025; originally announced March 2025.
Comments: Under review
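The EMR step, separating overlapping masks and merging similar ones, is the most mechanical part of this pipeline. The sketch below shows one plausible way such a merge could be written, using mask IoU as the similarity measure; the IoU threshold and greedy merge order are assumptions for illustration and are not taken from the paper.

```python
# Hypothetical illustration of merging similar binary masks (not the E-SAM EMR code).
# Assumption: "entity-level consistency" is approximated here by mask IoU with a fixed threshold.
import numpy as np

def mask_iou(a: np.ndarray, b: np.ndarray) -> float:
    inter = np.logical_and(a, b).sum()
    union = np.logical_or(a, b).sum()
    return inter / union if union else 0.0

def merge_similar_masks(masks, iou_thresh=0.8):
    """Greedily merge boolean masks whose IoU exceeds iou_thresh."""
    merged = []
    for m in masks:
        for i, kept in enumerate(merged):
            if mask_iou(m, kept) >= iou_thresh:
                merged[i] = np.logical_or(kept, m)   # union of near-duplicate masks
                break
        else:
            merged.append(m.copy())
    return merged

a = np.zeros((8, 8), bool); a[:4, :4] = True
b = a.copy(); b[0, 4] = True                  # near-duplicate of a
c = np.zeros((8, 8), bool); c[5:, 5:] = True  # distinct entity
print(len(merge_similar_masks([a, b, c])))    # 2
```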
arXiv:2503.11080 [pdf, other] cs.CL cs.SD eess.AS
Joint Training And Decoding for Multilingual End-to-End Simultaneous Speech Translation
Authors: Wuwei Huang, Renren Jin, Wen Zhang, Jian Luan, Bin Wang, Deyi Xiong
Abstract: Recent studies on end-to-end speech translation (ST) have facilitated the exploration of multilingual end-to-end ST and end-to-end simultaneous ST. In this paper, we investigate end-to-end simultaneous speech translation in a one-to-many multilingual setting, which is closer to real-world applications. We explore a separate decoder architecture and a unified architecture for joint synchronous training in this scenario. To further explore knowledge transfer across languages, we propose an asynchronous training strategy on the proposed unified decoder architecture. A multi-way aligned multilingual end-to-end ST dataset was curated as a benchmark testbed to evaluate our methods. Experimental results demonstrate the effectiveness of our models on the collected dataset. Our codes and data are available at: https://github.com/XiaoMi/TED-MMST.
Submitted 14 March, 2025; originally announced March 2025.
Comments: ICASSP 2023

arXiv:2503.10573 [pdf, other] cs.LG
Unveiling the Mathematical Reasoning in DeepSeek Models: A Comparative Study of Large Language Models
Authors: Afrar Jahin, Arif Hassan Zidan, Yu Bao, Shizhe Liang, Tianming Liu, Wei Zhang
Abstract: With the rapid evolution of Artificial Intelligence (AI), Large Language Models (LLMs) have reshaped the frontiers of various fields, spanning healthcare, public health, engineering, science, agriculture, education, arts, humanities, and mathematical reasoning. Among these advancements, DeepSeek models have emerged as noteworthy contenders, demonstrating promising capabilities that set them apart from their peers. While previous studies have conducted comparative analyses of LLMs, few have delivered a comprehensive evaluation of mathematical reasoning across a broad spectrum of LLMs. In this work, we aim to bridge this gap by conducting an in-depth comparative study focusing on the strengths and limitations of DeepSeek models relative to their leading counterparts.
In particular, our study systematically evaluates the mathematical reasoning performance of two DeepSeek models alongside five prominent LLMs across three independent benchmark datasets. The findings reveal several key insights: (1) DeepSeek-R1 consistently achieved the highest accuracy on two of the three datasets, demonstrating strong mathematical reasoning capabilities; (2) the distilled variant significantly underperformed compared to its peers, highlighting potential drawbacks of distillation techniques; (3) in terms of response time, Gemini 2.0 Flash demonstrated the fastest processing speed, outperforming the other models in efficiency, a crucial factor for real-time applications. Beyond these quantitative assessments, we examine how architecture, training, and optimization affect LLMs' mathematical reasoning. Moreover, our study goes beyond mere performance comparison by identifying key areas for future advancements in LLM-driven mathematical reasoning. This research enhances our understanding of LLMs' mathematical reasoning and lays the groundwork for future advancements.
Submitted 13 March, 2025; originally announced March 2025.

arXiv:2503.10525 [pdf, other] cs.DC
Efficient Precoding in XL-MIMO-AFDM System
Authors: Jun Zhu, Yin Xu, Dazhi He, Haoyang Li, Yunfeng Guan, Wenjun Zhang, Tianyao Ma, Haozhi Yuan
Abstract: This paper explores the potential of affine frequency division multiplexing (AFDM) to mitigate the multiuser interference (MUI) problem by employing time-domain precoding in extremely-large-scale multiple-input multiple-output (XL-MIMO) systems. In XL-MIMO systems, user mobility significantly improves network capacity and transmission quality. Meanwhile, the robustness of AFDM to Doppler shift is enhanced in user mobility scenarios, which further improves system performance. However, the multicarrier nature of AFDM leads to a sharp increase in precoding complexity. Therefore, we employ an efficient randomized Kaczmarz (rKA) precoding scheme to reduce the complexity overhead. Through simulation analysis, we compare the performance of XL-MIMO-AFDM and XL-MIMO orthogonal frequency division multiplexing (XL-MIMO-OFDM) in mobile scenarios, and the results show that our proposed AFDM-based XL-MIMO precoding design can be more efficient.
Submitted 13 March, 2025; originally announced March 2025.
Comments: 6
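Randomized Kaczmarz is a standard row-action iteration for linear systems, and precoding can be posed as solving H x = s for the transmit vector. The sketch below shows the generic rKA update on a small random underdetermined system; the dimensions, sampling weights, and iteration count are textbook choices used for illustration, not the configuration in the paper.

```python
# Generic randomized Kaczmarz (rKA) iteration for a consistent system H x = s,
# the kind of row-action solver referred to for low-complexity precoding.
# Sizes, sampling weights, and iteration count are illustrative assumptions only.
import numpy as np

def randomized_kaczmarz(H, s, n_iters=3000, seed=0):
    """Solve H x = s (H: users x antennas) by sampling rows w.p. proportional to ||h_i||^2."""
    rng = np.random.default_rng(seed)
    m, n = H.shape
    row_norms2 = np.sum(np.abs(H) ** 2, axis=1)
    probs = row_norms2 / row_norms2.sum()
    x = np.zeros(n, dtype=H.dtype)
    for _ in range(n_iters):
        i = rng.choice(m, p=probs)
        # Project the current iterate onto the hyperplane {x : h_i x = s_i}.
        x = x + ((s[i] - H[i] @ x) / row_norms2[i]) * np.conj(H[i])
    return x

rng = np.random.default_rng(1)
H = rng.normal(size=(8, 16)) + 1j * rng.normal(size=(8, 16))  # toy 8-user, 16-antenna channel
s = rng.normal(size=8) + 1j * rng.normal(size=8)              # desired received symbols
x = randomized_kaczmarz(H, s)
print(np.linalg.norm(H @ x - s))  # residual should be close to zero
```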
arXiv:2503.09501 [pdf, other] cs.AI cs.CL cs.LG cs.MA
ReMA: Learning to Meta-think for LLMs with Multi-Agent Reinforcement Learning
Authors: Ziyu Wan, Yunxiang Li, Yan Song, Hanjing Wang, Linyi Yang, Mark Schmidt, Jun Wang, Weinan Zhang, Shuyue Hu, Ying Wen
Abstract: Recent research on reasoning in Large Language Models (LLMs) has sought to further enhance their performance by integrating meta-thinking -- enabling models to monitor, evaluate, and control their reasoning processes for more adaptive and effective problem-solving. However, current single-agent work lacks a specialized design for acquiring meta-thinking, resulting in low efficacy. To address this challenge, we introduce Reinforced Meta-thinking Agents (ReMA), a novel framework that leverages Multi-Agent Reinforcement Learning (MARL) to elicit meta-thinking behaviors, encouraging LLMs to think about thinking.
ReMA decouples the reasoning process into two hierarchical agents: a high-level meta-thinking agent responsible for generating strategic oversight and plans, and a low-level reasoning agent responsible for detailed execution. Through iterative reinforcement learning with aligned objectives, these agents explore and learn to collaborate, leading to improved generalization and robustness. Experimental results demonstrate that ReMA outperforms single-agent RL baselines on complex reasoning tasks, including competition-level mathematical benchmarks and LLM-as-a-Judge benchmarks. Comprehensive ablation studies further illustrate the evolving dynamics of each agent, providing valuable insights into how the meta-thinking reasoning process enhances the reasoning capabilities of LLMs.
Submitted 14 March, 2025; v1 submitted 12 March, 2025; originally announced March 2025.
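The hierarchical split into a meta-thinking agent and a reasoning agent is the core structural idea here. The sketch below is a purely hypothetical outline of that two-agent loop: `call_llm` is a placeholder for whatever chat-completion client is available, and the prompt wording and reward hook are illustrative assumptions, not the ReMA training setup.

```python
# Hypothetical outline of a two-level meta-thinking / reasoning loop (not the ReMA code).
# call_llm is a placeholder for any chat-completion client; prompts and the reward hook
# are illustrative assumptions.
from typing import Callable

def solve_with_meta_thinking(question: str, call_llm: Callable[[str], str]) -> dict:
    # High-level agent: produce strategic oversight / a plan for the problem.
    plan = call_llm(
        "You are a meta-thinking agent. Without solving it, outline a strategy "
        f"for this problem:\n{question}"
    )
    # Low-level agent: execute the plan in detail and produce the final answer.
    answer = call_llm(
        "You are a reasoning agent. Follow this plan step by step and answer.\n"
        f"Plan:\n{plan}\n\nProblem:\n{question}"
    )
    # In an RL setting, a shared reward on `answer` would update both agents.
    return {"plan": plan, "answer": answer}

# Toy usage with a stub "LLM" so the sketch runs without any API.
stub = lambda prompt: f"[stub response to: {prompt[:40]}...]"
print(solve_with_meta_thinking("What is 17 * 24?", stub)["answer"])
```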
arXiv:2503.09358 [pdf, other] cs.CL cs.AI
RetSTA: An LLM-Based Approach for Standardizing Clinical Fundus Image Reports
Authors: Jiushen Cai, Weihang Zhang, Hanruo Liu, Ningli Wang, Huiqi Li
Abstract: Standardization of clinical reports is crucial for improving the quality of healthcare and facilitating data integration. The lack of unified standards, including format, terminology, and style, is a great challenge in clinical fundus diagnostic reports, which increases the difficulty for large language models (LLMs) to understand the data. To address this, we construct a bilingual standard terminology containing fundus clinical terms and descriptions commonly used in clinical diagnosis. Then, we establish two models, RetSTA-7B-Zero and RetSTA-7B. RetSTA-7B-Zero, fine-tuned on an augmented dataset simulating clinical scenarios, demonstrates powerful standardization behaviors, but it is limited in its ability to cover a wider range of diseases. To further enhance standardization performance, we build RetSTA-7B, which integrates a substantial amount of standardized data generated by RetSTA-7B-Zero along with corresponding English data, covering diverse complex clinical scenarios and achieving report-level standardization for the first time. Experimental results demonstrate that RetSTA-7B outperforms other compared LLMs on the bilingual standardization task, which validates its superior performance and generalizability. The checkpoints are available at https://github.com/AB-Story/RetSTA-7B.
Submitted 12 March, 2025; originally announced March 2025.

arXiv:2503.08104 [pdf] cond-mat.mtrl-sci cs.LG
Functional Unit: A New Perspective on Materials Science Research Paradigms
Authors: Caichao Ye, Tao Feng, Weishu Liu, Wenqing Zhang
Abstract: New materials have long marked the level of civilization, serving as an impetus for technological progress and societal transformation. Classic structure-property correlations have been the key to materials science and engineering. However, materials knowledge faces significant challenges in adapting to exclusively data-driven approaches for new material discovery. This perspective introduces the concept of functional units (FUs) to fill the gap in the understanding of material structure-property correlations and in knowledge inheritance as the "composition-microstructure" paradigm transitions to a data-driven AI paradigm. First, we provide a bird's-eye view of the evolution of research paradigms from the early "process-structure-properties-performance" framework to the contemporary data-driven AI trend. Next, we highlight recent advances in the characterization of functional units across diverse material systems, emphasizing their critical role in multiscale material design. Finally, we discuss the integration of functional units into the new AI-driven paradigm of materials science, addressing both opportunities and challenges in computational materials innovation.
Submitted 11 March, 2025; originally announced March 2025.