Search | arXiv e-print repository

<!DOCTYPE html> <html lang="en"> <head> <meta charset="utf-8"/> <meta name="viewport" content="width=device-width, initial-scale=1"/>  <link rel="apple-touch-icon" sizes="180x180" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/apple-touch-icon.png"> <link rel="icon" type="image/png" sizes="32x32" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-32x32.png"> <link rel="icon" type="image/png" sizes="16x16" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-16x16.png"> <link rel="manifest" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/site.webmanifest"> <link rel="mask-icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/safari-pinned-tab.svg" color="#b31b1b"> <link rel="shortcut icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon.ico"> <meta name="msapplication-TileColor" content="#b31b1b"> <meta name="msapplication-config" content="images/icons/browserconfig.xml"> <meta name="theme-color" content="#b31b1b">  <title>Search | arXiv e-print repository</title> <script defer src="https://static.arxiv.org/static/base/1.0.0a5/fontawesome-free-5.11.2-web/js/all.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/base/1.0.0a5/css/arxivstyle.css" /> <script type="text/x-mathjax-config"> MathJax.Hub.Config({ messageStyle: "none", extensions: ["tex2jax.js"], jax: ["input/TeX", "output/HTML-CSS"], tex2jax: { inlineMath: [ ['$','$'], ["\$","\$"] ], displayMath: [ ['$$','$$'], ["\\[","\\]"] ], processEscapes: true, ignoreClass: '.*', processClass: 'mathjax.*' }, TeX: { extensions: ["AMSmath.js", "AMSsymbols.js", "noErrors.js"], noErrors: { inlineDelimiters: ["$","$"], multiLine: false, style: { "font-size": "normal", "border": "" } } }, "HTML-CSS": { availableFonts: ["TeX"] } }); </script> <script src='//static.arxiv.org/MathJax-2.7.3/MathJax.js'></script> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/notification.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/bulma-tooltip.min.css" /> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/search.css" /> <script src="https://code.jquery.com/jquery-3.2.1.slim.min.js" integrity="sha256-k2WSCIexGzOj3Euiig+TlR8gA0EmPjuc79OEeY5L45g=" crossorigin="anonymous"></script> <script src="https://static.arxiv.org/static/search/0.5.6/js/fieldset.js"></script> <style> radio#cf-customfield_11400 { display: none; } </style> </head> <body> <header><a href="#main-container" class="is-sr-only">Skip to main content</a>  <div class="attribution level is-marginless" role="banner"> <div class="level-left"> <a class="level-item" href="https://cornell.edu/"><img src="https://static.arxiv.org/static/base/1.0.0a5/images/cornell-reduced-white-SMALL.svg" alt="Cornell University" width="200" aria-label="logo" /></a> </div> <div class="level-right is-marginless"><p class="sponsors level-item is-marginless"><span id="support-ack-url">We gratefully acknowledge support from<br /> the Simons Foundation, <a href="https://info.arxiv.org/about/ourmembers.html">member institutions</a>, and all contributors. <a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div>  <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div>  <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1–50 of 7,748 results for author: <span class="mathjax">Wang, Z</span> </h1> </div> <div class="level-right is-hidden-mobile">  <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>  </span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&query=Wang%2C+Z">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." type="text" value="Wang, Z"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Wang%2C+Z&terms-0-field=author&size=50&order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Wang, Z"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. </div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Wang%2C+Z&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Wang%2C+Z&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Wang%2C+Z&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Wang%2C+Z&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Wang%2C+Z&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Wang%2C+Z&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">…</span></li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.21679">arXiv:2503.21679</a> <span> [<a href="https://arxiv.org/pdf/2503.21679">pdf</a>, <a href="https://arxiv.org/format/2503.21679">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computers and Society">cs.CY</span> </div> </div> <p class="title is-5 mathjax"> JiraiBench: A Bilingual Benchmark for Evaluating Large Language Models' Detection of Human Self-Destructive Behavior Content in Jirai Community </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xiao%2C+Y">Yunze Xiao</a>, <a href="/search/cs?searchtype=author&query=He%2C+T">Tingyu He</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+L+Z">Lionel Z. Wang</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+Y">Yiming Ma</a>, <a href="/search/cs?searchtype=author&query=Song%2C+X">Xingyu Song</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+X">Xiaohang Xu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+I">Irene Li</a>, <a href="/search/cs?searchtype=author&query=Ng%2C+K+C">Ka Chung Ng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.21679v1-abstract-short" style="display: inline;"> This paper introduces JiraiBench, the first bilingual benchmark for evaluating large language models' effectiveness in detecting self-destructive content across Chinese and Japanese social media communities. Focusing on the transnational "Jirai" (landmine) online subculture that encompasses multiple forms of self-destructive behaviors including drug overdose, eating disorders, and self-harm, we pr… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.21679v1-abstract-full').style.display = 'inline'; document.getElementById('2503.21679v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.21679v1-abstract-full" style="display: none;"> This paper introduces JiraiBench, the first bilingual benchmark for evaluating large language models' effectiveness in detecting self-destructive content across Chinese and Japanese social media communities. Focusing on the transnational "Jirai" (landmine) online subculture that encompasses multiple forms of self-destructive behaviors including drug overdose, eating disorders, and self-harm, we present a comprehensive evaluation framework incorporating both linguistic and cultural dimensions. Our dataset comprises 10,419 Chinese posts and 5,000 Japanese posts with multidimensional annotation along three behavioral categories, achieving substantial inter-annotator agreement. Experimental evaluations across four state-of-the-art models reveal significant performance variations based on instructional language, with Japanese prompts unexpectedly outperforming Chinese prompts when processing Chinese content. This emergent cross-cultural transfer suggests that cultural proximity can sometimes outweigh linguistic similarity in detection tasks. Cross-lingual transfer experiments with fine-tuned models further demonstrate the potential for knowledge transfer between these language systems without explicit target language training. These findings highlight the need for culturally-informed approaches to multilingual content moderation and provide empirical evidence for the importance of cultural context in developing more effective detection systems for vulnerable online communities. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.21679v1-abstract-full').style.display = 'none'; document.getElementById('2503.21679v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">20 pages, 1 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.21505">arXiv:2503.21505</a> <span> [<a href="https://arxiv.org/pdf/2503.21505">pdf</a>, <a href="https://arxiv.org/format/2503.21505">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Fine-Grained Evaluation of Large Vision-Language Models in Autonomous Driving </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yue Li</a>, <a href="/search/cs?searchtype=author&query=Tian%2C+M">Meng Tian</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+Z">Zhenyu Lin</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+J">Jiangtong Zhu</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+D">Dechang Zhu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+H">Haiqiang Liu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zining Wang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yueyi Zhang</a>, <a href="/search/cs?searchtype=author&query=Xiong%2C+Z">Zhiwei Xiong</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+X">Xinhai Zhao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.21505v1-abstract-short" style="display: inline;"> Existing benchmarks for Vision-Language Model (VLM) on autonomous driving (AD) primarily assess interpretability through open-form visual question answering (QA) within coarse-grained tasks, which remain insufficient to assess capabilities in complex driving scenarios. To this end, we introduce $\textbf{VLADBench}$, a challenging and fine-grained dataset featuring close-form QAs that progress from… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.21505v1-abstract-full').style.display = 'inline'; document.getElementById('2503.21505v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.21505v1-abstract-full" style="display: none;"> Existing benchmarks for Vision-Language Model (VLM) on autonomous driving (AD) primarily assess interpretability through open-form visual question answering (QA) within coarse-grained tasks, which remain insufficient to assess capabilities in complex driving scenarios. To this end, we introduce $\textbf{VLADBench}$, a challenging and fine-grained dataset featuring close-form QAs that progress from static foundational knowledge and elements to advanced reasoning for dynamic on-road situations. The elaborate $\textbf{VLADBench}$ spans 5 key domains: Traffic Knowledge Understanding, General Element Recognition, Traffic Graph Generation, Target Attribute Comprehension, and Ego Decision-Making and Planning. These domains are further broken down into 11 secondary aspects and 29 tertiary tasks for a granular evaluation. A thorough assessment of general and domain-specific (DS) VLMs on this benchmark reveals both their strengths and critical limitations in AD contexts. To further exploit the cognitive and reasoning interactions among the 5 domains for AD understanding, we start from a small-scale VLM and train the DS models on individual domain datasets (collected from 1.4M DS QAs across public sources). The experimental results demonstrate that the proposed benchmark provides a crucial step toward a more comprehensive assessment of VLMs in AD, paving the way for the development of more cognitively sophisticated and reasoning-capable AD systems. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.21505v1-abstract-full').style.display = 'none'; document.getElementById('2503.21505v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.21380">arXiv:2503.21380</a> <span> [<a href="https://arxiv.org/pdf/2503.21380">pdf</a>, <a href="https://arxiv.org/format/2503.21380">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Challenging the Boundaries of Reasoning: An Olympiad-Level Math Benchmark for Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Sun%2C+H">Haoxiang Sun</a>, <a href="/search/cs?searchtype=author&query=Min%2C+Y">Yingqian Min</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Z">Zhipeng Chen</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+W+X">Wayne Xin Zhao</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Z">Zheng Liu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zhongyuan Wang</a>, <a href="/search/cs?searchtype=author&query=Fang%2C+L">Lei Fang</a>, <a href="/search/cs?searchtype=author&query=Wen%2C+J">Ji-Rong Wen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.21380v1-abstract-short" style="display: inline;"> In recent years, the rapid development of large reasoning models has resulted in the saturation of existing benchmarks for evaluating mathematical reasoning, highlighting the urgent need for more challenging and rigorous evaluation frameworks. To address this gap, we introduce OlymMATH, a novel Olympiad-level mathematical benchmark, designed to rigorously test the complex reasoning capabilities of… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.21380v1-abstract-full').style.display = 'inline'; document.getElementById('2503.21380v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.21380v1-abstract-full" style="display: none;"> In recent years, the rapid development of large reasoning models has resulted in the saturation of existing benchmarks for evaluating mathematical reasoning, highlighting the urgent need for more challenging and rigorous evaluation frameworks. To address this gap, we introduce OlymMATH, a novel Olympiad-level mathematical benchmark, designed to rigorously test the complex reasoning capabilities of LLMs. OlymMATH features 200 meticulously curated problems, each manually verified and available in parallel English and Chinese versions. The problems are systematically organized into two distinct difficulty tiers: (1) AIME-level problems (easy) that establish a baseline for mathematical reasoning assessment, and (2) significantly more challenging problems (hard) designed to push the boundaries of current state-of-the-art models. In our benchmark, these problems span four core mathematical fields, each including a verifiable numerical solution to enable objective, rule-based evaluation. Empirical results underscore the significant challenge presented by OlymMATH, with state-of-the-art models including DeepSeek-R1 and OpenAI's o3-mini demonstrating notably limited accuracy on the hard subset. Furthermore, the benchmark facilitates comprehensive bilingual assessment of mathematical reasoning abilities-a critical dimension that remains largely unaddressed in mainstream mathematical reasoning benchmarks. We release the OlymMATH benchmark at the STILL project: https://github.com/RUCAIBox/Slow_Thinking_with_LLMs. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.21380v1-abstract-full').style.display = 'none'; document.getElementById('2503.21380v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Technical Report on Slow Thinking with LLMs: Evaluation Benchmark</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.21254">arXiv:2503.21254</a> <span> [<a href="https://arxiv.org/pdf/2503.21254">pdf</a>, <a href="https://arxiv.org/format/2503.21254">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Vision-to-Music Generation: A Survey </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zhaokai Wang</a>, <a href="/search/cs?searchtype=author&query=Bao%2C+C">Chenxi Bao</a>, <a href="/search/cs?searchtype=author&query=Zhuo%2C+L">Le Zhuo</a>, <a href="/search/cs?searchtype=author&query=Han%2C+J">Jingrui Han</a>, <a href="/search/cs?searchtype=author&query=Yue%2C+Y">Yang Yue</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+Y">Yihong Tang</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+V+S">Victor Shea-Jay Huang</a>, <a href="/search/cs?searchtype=author&query=Liao%2C+Y">Yue Liao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.21254v1-abstract-short" style="display: inline;"> Vision-to-music Generation, including video-to-music and image-to-music tasks, is a significant branch of multimodal artificial intelligence demonstrating vast application prospects in fields such as film scoring, short video creation, and dance music synthesis. However, compared to the rapid development of modalities like text and images, research in vision-to-music is still in its preliminary st… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.21254v1-abstract-full').style.display = 'inline'; document.getElementById('2503.21254v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.21254v1-abstract-full" style="display: none;"> Vision-to-music Generation, including video-to-music and image-to-music tasks, is a significant branch of multimodal artificial intelligence demonstrating vast application prospects in fields such as film scoring, short video creation, and dance music synthesis. However, compared to the rapid development of modalities like text and images, research in vision-to-music is still in its preliminary stage due to its complex internal structure and the difficulty of modeling dynamic relationships with video. Existing surveys focus on general music generation without comprehensive discussion on vision-to-music. In this paper, we systematically review the research progress in the field of vision-to-music generation. We first analyze the technical characteristics and core challenges for three input types: general videos, human movement videos, and images, as well as two output types of symbolic music and audio music. We then summarize the existing methodologies on vision-to-music generation from the architecture perspective. A detailed review of common datasets and evaluation metrics is provided. Finally, we discuss current challenges and promising directions for future research. We hope our survey can inspire further innovation in vision-to-music generation and the broader field of multimodal generation in academic research and industrial applications. To follow latest works and foster further innovation in this field, we are continuously maintaining a GitHub repository at https://github.com/wzk1015/Awesome-Vision-to-Music-Generation. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.21254v1-abstract-full').style.display = 'none'; document.getElementById('2503.21254v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.21227">arXiv:2503.21227</a> <span> [<a href="https://arxiv.org/pdf/2503.21227">pdf</a>, <a href="https://arxiv.org/format/2503.21227">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> LLaVA-CMoE: Towards Continual Mixture of Experts for Large Vision-Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhao%2C+H">Hengyuan Zhao</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Ziqin Wang</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+Q">Qixin Sun</a>, <a href="/search/cs?searchtype=author&query=Song%2C+K">Kaiyou Song</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yilin Li</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+X">Xiaolin Hu</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+Q">Qingpei Guo</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+S">Si Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.21227v1-abstract-short" style="display: inline;"> Although applying Mixture of Experts to large language models for learning new tasks is widely regarded as an effective strategy for continuous learning, there still remain two major challenges: (1) As the number of tasks grows, simple parameter expansion strategies can lead to excessively large models. (2) Modifying the parameters of the existing router results in the erosion of previously acquir… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.21227v1-abstract-full').style.display = 'inline'; document.getElementById('2503.21227v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.21227v1-abstract-full" style="display: none;"> Although applying Mixture of Experts to large language models for learning new tasks is widely regarded as an effective strategy for continuous learning, there still remain two major challenges: (1) As the number of tasks grows, simple parameter expansion strategies can lead to excessively large models. (2) Modifying the parameters of the existing router results in the erosion of previously acquired knowledge. In this paper, we present an innovative framework named LLaVA-CMoE, which is a continuous Mixture of Experts (MoE) architecture without any replay data. Specifically, we have developed a method called Probe-Guided Knowledge Extension (PGKE), which employs probe experts to assess whether additional knowledge is required for a specific layer. This approach enables the model to adaptively expand its network parameters based on task distribution, thereby significantly improving the efficiency of parameter expansion. Additionally, we introduce a hierarchical routing algorithm called Probabilistic Task Locator (PTL), where high-level routing captures inter-task information and low-level routing focuses on intra-task details, ensuring that new task experts do not interfere with existing ones. Our experiments shows that our efficient architecture has substantially improved model performance on the Coin benchmark while maintaining a reasonable parameter count. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.21227v1-abstract-full').style.display = 'none'; document.getElementById('2503.21227v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Preprint</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.21122">arXiv:2503.21122</a> <span> [<a href="https://arxiv.org/pdf/2503.21122">pdf</a>, <a href="https://arxiv.org/format/2503.21122">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> One Snapshot is All You Need: A Generalized Method for mmWave Signal Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Huang%2C+T">Teng Huang</a>, <a href="/search/cs?searchtype=author&query=Ding%2C+H">Han Ding</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+W">Wenxin Sun</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+C">Cui Zhao</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+G">Ge Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+F">Fei Wang</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+K">Kun Zhao</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zhi Wang</a>, <a href="/search/cs?searchtype=author&query=Xi%2C+W">Wei Xi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.21122v1-abstract-short" style="display: inline;"> Wireless sensing systems, particularly those using mmWave technology, offer distinct advantages over traditional vision-based approaches, such as enhanced privacy and effectiveness in poor lighting conditions. These systems, leveraging FMCW signals, have shown success in human-centric applications like localization, gesture recognition, and so on. However, comprehensive mmWave datasets for diverse… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.21122v1-abstract-full').style.display = 'inline'; document.getElementById('2503.21122v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.21122v1-abstract-full" style="display: none;"> Wireless sensing systems, particularly those using mmWave technology, offer distinct advantages over traditional vision-based approaches, such as enhanced privacy and effectiveness in poor lighting conditions. These systems, leveraging FMCW signals, have shown success in human-centric applications like localization, gesture recognition, and so on. However, comprehensive mmWave datasets for diverse applications are scarce, often constrained by pre-processed signatures (e.g., point clouds or RA heatmaps) and inconsistent annotation formats. To overcome these limitations, we propose mmGen, a novel and generalized framework tailored for full-scene mmWave signal generation. By constructing physical signal transmission models, mmGen synthesizes human-reflected and environment-reflected mmWave signals from the constructed 3D meshes. Additionally, we incorporate methods to account for material properties, antenna gains, and multipath reflections, enhancing the realism of the synthesized signals. We conduct extensive experiments using a prototype system with commercial mmWave devices and Kinect sensors. The results show that the average similarity of Range-Angle and micro-Doppler signatures between the synthesized and real-captured signals across three different environments exceeds 0.91 and 0.89, respectively, demonstrating the effectiveness and practical applicability of mmGen. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.21122v1-abstract-full').style.display = 'none'; document.getElementById('2503.21122v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">IEEE INFOCOM 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.21094">arXiv:2503.21094</a> <span> [<a href="https://arxiv.org/pdf/2503.21094">pdf</a>, <a href="https://arxiv.org/format/2503.21094">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1145/3706598.3713739">10.1145/3706598.3713739 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> GazeSwipe: Enhancing Mobile Touchscreen Reachability through Seamless Gaze and Finger-Swipe Integration </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Cai%2C+Z">Zhuojiang Cai</a>, <a href="/search/cs?searchtype=author&query=Hong%2C+J">Jingkai Hong</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zhimin Wang</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+F">Feng Lu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.21094v1-abstract-short" style="display: inline;"> Smartphones with large screens provide users with increased display and interaction space but pose challenges in reaching certain areas with the thumb when using the device with one hand. To address this, we introduce GazeSwipe, a multimodal interaction technique that combines eye gaze with finger-swipe gestures, enabling intuitive and low-friction reach on mobile touchscreens. Specifically, we de… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.21094v1-abstract-full').style.display = 'inline'; document.getElementById('2503.21094v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.21094v1-abstract-full" style="display: none;"> Smartphones with large screens provide users with increased display and interaction space but pose challenges in reaching certain areas with the thumb when using the device with one hand. To address this, we introduce GazeSwipe, a multimodal interaction technique that combines eye gaze with finger-swipe gestures, enabling intuitive and low-friction reach on mobile touchscreens. Specifically, we design a gaze estimation method that eliminates the need for explicit gaze calibration. Our approach also avoids the use of additional eye-tracking hardware by leveraging the smartphone's built-in front-facing camera. Considering the potential decrease in gaze accuracy without dedicated eye trackers, we use finger-swipe gestures to compensate for any inaccuracies in gaze estimation. Additionally, we introduce a user-unaware auto-calibration method that improves gaze accuracy during interaction. Through extensive experiments on smartphones and tablets, we compare our technique with various methods for touchscreen reachability and evaluate the performance of our auto-calibration strategy. The results demonstrate that our method achieves high success rates and is preferred by users. The findings also validate the effectiveness of the auto-calibration strategy. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.21094v1-abstract-full').style.display = 'none'; document.getElementById('2503.21094v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.20822">arXiv:2503.20822</a> <span> [<a href="https://arxiv.org/pdf/2503.20822">pdf</a>, <a href="https://arxiv.org/format/2503.20822">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> </div> </div> <p class="title is-5 mathjax"> Synthetic Video Enhances Physical Fidelity in Video Synthesis </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhao%2C+Q">Qi Zhao</a>, <a href="/search/cs?searchtype=author&query=Ni%2C+X">Xingyu Ni</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Ziyu Wang</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+F">Feng Cheng</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Z">Ziyan Yang</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+L">Lu Jiang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+B">Bohan Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.20822v1-abstract-short" style="display: inline;"> We investigate how to enhance the physical fidelity of video generation models by leveraging synthetic videos derived from computer graphics pipelines. These rendered videos respect real-world physics, such as maintaining 3D consistency, and serve as a valuable resource that can potentially improve video generation models. To harness this potential, we propose a solution that curates and integrate… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.20822v1-abstract-full').style.display = 'inline'; document.getElementById('2503.20822v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.20822v1-abstract-full" style="display: none;"> We investigate how to enhance the physical fidelity of video generation models by leveraging synthetic videos derived from computer graphics pipelines. These rendered videos respect real-world physics, such as maintaining 3D consistency, and serve as a valuable resource that can potentially improve video generation models. To harness this potential, we propose a solution that curates and integrates synthetic data while introducing a method to transfer its physical realism to the model, significantly reducing unwanted artifacts. Through experiments on three representative tasks emphasizing physical consistency, we demonstrate its efficacy in enhancing physical fidelity. While our model still lacks a deep understanding of physics, our work offers one of the first empirical demonstrations that synthetic video enhances physical fidelity in video synthesis. Website: https://kevinz8866.github.io/simulation/ <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.20822v1-abstract-full').style.display = 'none'; document.getElementById('2503.20822v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.20776">arXiv:2503.20776</a> <span> [<a href="https://arxiv.org/pdf/2503.20776">pdf</a>, <a href="https://arxiv.org/format/2503.20776">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Feature4X: Bridging Any Monocular Video to 4D Agentic AI with Versatile Gaussian Feature Fields </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhou%2C+S">Shijie Zhou</a>, <a href="/search/cs?searchtype=author&query=Ren%2C+H">Hui Ren</a>, <a href="/search/cs?searchtype=author&query=Weng%2C+Y">Yijia Weng</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+S">Shuwang Zhang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zhen Wang</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+D">Dejia Xu</a>, <a href="/search/cs?searchtype=author&query=Fan%2C+Z">Zhiwen Fan</a>, <a href="/search/cs?searchtype=author&query=You%2C+S">Suya You</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zhangyang Wang</a>, <a href="/search/cs?searchtype=author&query=Guibas%2C+L">Leonidas Guibas</a>, <a href="/search/cs?searchtype=author&query=Kadambi%2C+A">Achuta Kadambi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.20776v1-abstract-short" style="display: inline;"> Recent advancements in 2D and multimodal models have achieved remarkable success by leveraging large-scale training on extensive datasets. However, extending these achievements to enable free-form interactions and high-level semantic operations with complex 3D/4D scenes remains challenging. This difficulty stems from the limited availability of large-scale, annotated 3D/4D or multi-view datasets,… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.20776v1-abstract-full').style.display = 'inline'; document.getElementById('2503.20776v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.20776v1-abstract-full" style="display: none;"> Recent advancements in 2D and multimodal models have achieved remarkable success by leveraging large-scale training on extensive datasets. However, extending these achievements to enable free-form interactions and high-level semantic operations with complex 3D/4D scenes remains challenging. This difficulty stems from the limited availability of large-scale, annotated 3D/4D or multi-view datasets, which are crucial for generalizable vision and language tasks such as open-vocabulary and prompt-based segmentation, language-guided editing, and visual question answering (VQA). In this paper, we introduce Feature4X, a universal framework designed to extend any functionality from 2D vision foundation model into the 4D realm, using only monocular video input, which is widely available from user-generated content. The "X" in Feature4X represents its versatility, enabling any task through adaptable, model-conditioned 4D feature field distillation. At the core of our framework is a dynamic optimization strategy that unifies multiple model capabilities into a single representation. Additionally, to the best of our knowledge, Feature4X is the first method to distill and lift the features of video foundation models (e.g. SAM2, InternVideo2) into an explicit 4D feature field using Gaussian Splatting. Our experiments showcase novel view segment anything, geometric and appearance scene editing, and free-form VQA across all time steps, empowered by LLMs in feedback loops. These advancements broaden the scope of agentic AI applications by providing a foundation for scalable, contextually and spatiotemporally aware systems capable of immersive dynamic 4D scene interaction. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.20776v1-abstract-full').style.display = 'none'; document.getElementById('2503.20776v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.20768">arXiv:2503.20768</a> <span> [<a href="https://arxiv.org/pdf/2503.20768">pdf</a>, <a href="https://arxiv.org/format/2503.20768">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> </div> </div> <p class="title is-5 mathjax"> An Empirical Study of the Impact of Federated Learning on Machine Learning Model Accuracy </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yang%2C+H">Haotian Yang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zhuoran Wang</a>, <a href="/search/cs?searchtype=author&query=Chou%2C+B">Benson Chou</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+S">Sophie Xu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+H">Hao Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+J">Jingxian Wang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Q">Qizhen Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.20768v2-abstract-short" style="display: inline;"> Federated Learning (FL) enables distributed ML model training on private user data at the global scale. Despite the potential of FL demonstrated in many domains, an in-depth view of its impact on model accuracy remains unclear. In this paper, we investigate, systematically, how this learning paradigm can affect the accuracy of state-of-the-art ML models for a variety of ML tasks. We present an emp… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.20768v2-abstract-full').style.display = 'inline'; document.getElementById('2503.20768v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.20768v2-abstract-full" style="display: none;"> Federated Learning (FL) enables distributed ML model training on private user data at the global scale. Despite the potential of FL demonstrated in many domains, an in-depth view of its impact on model accuracy remains unclear. In this paper, we investigate, systematically, how this learning paradigm can affect the accuracy of state-of-the-art ML models for a variety of ML tasks. We present an empirical study that involves various data types: text, image, audio, and video, and FL configuration knobs: data distribution, FL scale, client sampling, and local and global computations. Our experiments are conducted in a unified FL framework to achieve high fidelity, with substantial human efforts and resource investments. Based on the results, we perform a quantitative analysis of the impact of FL, and highlight challenging scenarios where applying FL degrades the accuracy of the model drastically and identify cases where the impact is negligible. The detailed and extensive findings can benefit practical deployments and future development of FL. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.20768v2-abstract-full').style.display = 'none'; document.getElementById('2503.20768v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 26 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> C.2.4; I.2.6 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.20752">arXiv:2503.20752</a> <span> [<a href="https://arxiv.org/pdf/2503.20752">pdf</a>, <a href="https://arxiv.org/format/2503.20752">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Reason-RFT: Reinforcement Fine-Tuning for Visual Reasoning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Tan%2C+H">Huajie Tan</a>, <a href="/search/cs?searchtype=author&query=Ji%2C+Y">Yuheng Ji</a>, <a href="/search/cs?searchtype=author&query=Hao%2C+X">Xiaoshuai Hao</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+M">Minglan Lin</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+P">Pengwei Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zhongyuan Wang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+S">Shanghang Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.20752v2-abstract-short" style="display: inline;"> Visual reasoning abilities play a crucial role in understanding complex multimodal data, advancing both domain-specific applications and artificial general intelligence (AGI). Existing methods improve VLM reasoning via Chain-of-Thought (CoT) supervised fine-tuning, using meticulously annotated training data to enhance visual reasoning capabilities. However, this training paradigm may lead to overf… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.20752v2-abstract-full').style.display = 'inline'; document.getElementById('2503.20752v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.20752v2-abstract-full" style="display: none;"> Visual reasoning abilities play a crucial role in understanding complex multimodal data, advancing both domain-specific applications and artificial general intelligence (AGI). Existing methods improve VLM reasoning via Chain-of-Thought (CoT) supervised fine-tuning, using meticulously annotated training data to enhance visual reasoning capabilities. However, this training paradigm may lead to overfitting and cognitive rigidity, restricting the model's ability to transfer visual reasoning skills across domains and limiting its real-world applicability. To address these limitations, we propose Reason-RFT, a novel reinforcement fine-tuning framework that significantly enhances generalization capabilities in visual reasoning tasks. Reason-RFT introduces a two-phase training framework for visual reasoning: (1) Supervised Fine-Tuning (SFT) with curated Chain-of-Thought (CoT) data activates the reasoning potential of Vision-Language Models (VLMs), followed by (2) Group Relative Policy Optimization (GRPO)-based reinforcement learning that generates multiple reasoning-response pairs, significantly enhancing generalization in visual reasoning tasks. To evaluate Reason-RFT's visual reasoning capabilities, we reconstructed a comprehensive dataset spanning visual counting, structure perception, and spatial transformation. Experimental results demonstrate Reasoning-RFT's three key advantages: (1) Performance Enhancement: achieving state-of-the-art results across multiple tasks, outperforming most mainstream open-source and proprietary models; (2) Generalization Superiority: consistently maintaining robust performance across diverse tasks and domains, outperforming alternative training paradigms; (3) Data Efficiency: excelling in few-shot learning scenarios while surpassing full-dataset SFT baselines. Project website: https://tanhuajie.github.io/ReasonRFT <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.20752v2-abstract-full').style.display = 'none'; document.getElementById('2503.20752v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 26 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">35 pages, 22 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.20701">arXiv:2503.20701</a> <span> [<a href="https://arxiv.org/pdf/2503.20701">pdf</a>, <a href="https://arxiv.org/format/2503.20701">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> UniEDU: A Unified Language and Vision Assistant for Education Applications </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chu%2C+Z">Zhendong Chu</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+J">Jian Xie</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+S">Shen Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zichao Wang</a>, <a href="/search/cs?searchtype=author&query=Wen%2C+Q">Qingsong Wen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.20701v1-abstract-short" style="display: inline;"> Education materials for K-12 students often consist of multiple modalities, such as text and images, posing challenges for models to fully understand nuanced information in these materials. In this paper, we propose a unified language and vision assistant UniEDU designed for various educational applications, including knowledge recommendation, knowledge tracing, time cost prediction, and user answ… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.20701v1-abstract-full').style.display = 'inline'; document.getElementById('2503.20701v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.20701v1-abstract-full" style="display: none;"> Education materials for K-12 students often consist of multiple modalities, such as text and images, posing challenges for models to fully understand nuanced information in these materials. In this paper, we propose a unified language and vision assistant UniEDU designed for various educational applications, including knowledge recommendation, knowledge tracing, time cost prediction, and user answer prediction, all within a single model. Unlike conventional task-specific models, UniEDU offers a unified solution that excels across multiple educational tasks while maintaining strong generalization capabilities. Its adaptability makes it well-suited for real-world deployment in diverse learning environments. Furthermore, UniEDU is optimized for industry-scale deployment by significantly reducing computational overhead-achieving approximately a 300\% increase in efficiency-while maintaining competitive performance with minimal degradation compared to fully fine-tuned models. This work represents a significant step toward creating versatile AI systems tailored to the evolving demands of education. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.20701v1-abstract-full').style.display = 'none'; document.getElementById('2503.20701v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.20644">arXiv:2503.20644</a> <span> [<a href="https://arxiv.org/pdf/2503.20644">pdf</a>, <a href="https://arxiv.org/format/2503.20644">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> MMGen: Unified Multi-modal Image Generation and Understanding in One Go </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+J">Jiepeng Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zhaoqing Wang</a>, <a href="/search/cs?searchtype=author&query=Pan%2C+H">Hao Pan</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yuan Liu</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+D">Dongdong Yu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+C">Changhu Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+W">Wenping Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.20644v1-abstract-short" style="display: inline;"> A unified diffusion framework for multi-modal generation and understanding has the transformative potential to achieve seamless and controllable image diffusion and other cross-modal tasks. In this paper, we introduce MMGen, a unified framework that integrates multiple generative tasks into a single diffusion model. This includes: (1) multi-modal category-conditioned generation, where multi-modal… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.20644v1-abstract-full').style.display = 'inline'; document.getElementById('2503.20644v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.20644v1-abstract-full" style="display: none;"> A unified diffusion framework for multi-modal generation and understanding has the transformative potential to achieve seamless and controllable image diffusion and other cross-modal tasks. In this paper, we introduce MMGen, a unified framework that integrates multiple generative tasks into a single diffusion model. This includes: (1) multi-modal category-conditioned generation, where multi-modal outputs are generated simultaneously through a single inference process, given category information; (2) multi-modal visual understanding, which accurately predicts depth, surface normals, and segmentation maps from RGB images; and (3) multi-modal conditioned generation, which produces corresponding RGB images based on specific modality conditions and other aligned modalities. Our approach develops a novel diffusion transformer that flexibly supports multi-modal output, along with a simple modality-decoupling strategy to unify various tasks. Extensive experiments and applications demonstrate the effectiveness and superiority of MMGen across diverse tasks and conditions, highlighting its potential for applications that require simultaneous generation and understanding. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.20644v1-abstract-full').style.display = 'none'; document.getElementById('2503.20644v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Our project page: https://jiepengwang.github.io/MMGen/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.20537">arXiv:2503.20537</a> <span> [<a href="https://arxiv.org/pdf/2503.20537">pdf</a>, <a href="https://arxiv.org/format/2503.20537">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> TD-BFR: Truncated Diffusion Model for Efficient Blind Face Restoration </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+Z">Ziying Zhang</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+X">Xiang Gao</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zhixin Wang</a>, <a href="/search/cs?searchtype=author&query=hu%2C+Q">Qiang hu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xiaoyun Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.20537v1-abstract-short" style="display: inline;"> Diffusion-based methodologies have shown significant potential in blind face restoration (BFR), leveraging their robust generative capabilities. However, they are often criticized for two significant problems: 1) slow training and inference speed, and 2) inadequate recovery of fine-grained facial details. To address these problems, we propose a novel Truncated Diffusion model for efficient Blind F… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.20537v1-abstract-full').style.display = 'inline'; document.getElementById('2503.20537v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.20537v1-abstract-full" style="display: none;"> Diffusion-based methodologies have shown significant potential in blind face restoration (BFR), leveraging their robust generative capabilities. However, they are often criticized for two significant problems: 1) slow training and inference speed, and 2) inadequate recovery of fine-grained facial details. To address these problems, we propose a novel Truncated Diffusion model for efficient Blind Face Restoration (TD-BFR), a three-stage paradigm tailored for the progressive resolution of degraded images. Specifically, TD-BFR utilizes an innovative truncated sampling method, starting from low-quality (LQ) images at low resolution to enhance sampling speed, and then introduces an adaptive degradation removal module to handle unknown degradations and connect the generation processes across different resolutions. Additionally, we further adapt the priors of pre-trained diffusion models to recover rich facial details. Our method efficiently restores high-quality images in a coarse-to-fine manner and experimental results demonstrate that TD-BFR is, on average, \textbf{4.75$\times$} faster than current state-of-the-art diffusion-based BFR methods while maintaining competitive quality. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.20537v1-abstract-full').style.display = 'none'; document.getElementById('2503.20537v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by ICME 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.20322">arXiv:2503.20322</a> <span> [<a href="https://arxiv.org/pdf/2503.20322">pdf</a>, <a href="https://arxiv.org/format/2503.20322">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Dynamic Pyramid Network for Efficient Multimodal Large Language Model </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ai%2C+H">Hao Ai</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+K">Kunyi Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zezhou Wang</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+H">Hao Lu</a>, <a href="/search/cs?searchtype=author&query=Tian%2C+J">Jin Tian</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+Y">Yaxin Luo</a>, <a href="/search/cs?searchtype=author&query=Xing%2C+P">Peng Xing</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+J">Jen-Yuan Huang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+H">Huaxia Li</a>, <a href="/search/cs?searchtype=author&query=luo%2C+G">Gen luo</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.20322v1-abstract-short" style="display: inline;"> Multimodal large language models (MLLMs) have demonstrated impressive performance in various vision-language (VL) tasks, but their expensive computations still limit the real-world application. To address this issue, recent efforts aim to compress the visual features to save the computational costs of MLLMs. However, direct visual compression methods, e.g. efficient projectors, inevitably destroy… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.20322v1-abstract-full').style.display = 'inline'; document.getElementById('2503.20322v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.20322v1-abstract-full" style="display: none;"> Multimodal large language models (MLLMs) have demonstrated impressive performance in various vision-language (VL) tasks, but their expensive computations still limit the real-world application. To address this issue, recent efforts aim to compress the visual features to save the computational costs of MLLMs. However, direct visual compression methods, e.g. efficient projectors, inevitably destroy the visual semantics in MLLM, especially in difficult samples. To overcome this shortcoming, we propose a novel dynamic pyramid network (DPN) for efficient MLLMs. Specifically, DPN formulates MLLM as a hierarchical structure where visual features are gradually compressed with increasing depth. In this case, even with a high compression ratio, fine-grained visual information can still be perceived in shallow layers. To maximize the benefit of DPN, we further propose an innovative Dynamic Pooling Experts (DPE) that can dynamically choose the optimal visual compression rate according to input features. With this design, harder samples will be assigned larger computations, thus preserving the model performance. To validate our approach, we conduct extensive experiments on two popular MLLMs and ten benchmarks. Experimental results show that DPN can save up to 56% average FLOPs on LLaVA while further achieving +0.74% performance gains. Besides, the generalization ability of DPN is also validated on the existing high-resolution MLLM called LLaVA-HR. Our source codes are anonymously released at https://github.com/aihao2000/DPN-LLaVA. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.20322v1-abstract-full').style.display = 'none'; document.getElementById('2503.20322v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.20309">arXiv:2503.20309</a> <span> [<a href="https://arxiv.org/pdf/2503.20309">pdf</a>, <a href="https://arxiv.org/format/2503.20309">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Instruction-Oriented Preference Alignment for Enhancing Multi-Modal Comprehension Capability of MLLMs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zitian Wang</a>, <a href="/search/cs?searchtype=author&query=Liao%2C+Y">Yue Liao</a>, <a href="/search/cs?searchtype=author&query=Rong%2C+K">Kang Rong</a>, <a href="/search/cs?searchtype=author&query=Rao%2C+F">Fengyun Rao</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Y">Yibo Yang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+S">Si Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.20309v1-abstract-short" style="display: inline;"> Preference alignment has emerged as an effective strategy to enhance the performance of Multimodal Large Language Models (MLLMs) following supervised fine-tuning. While existing preference alignment methods predominantly target hallucination factors, they overlook the factors essential for multi-modal comprehension capabilities, often narrowing their improvements on hallucination mitigation. To br… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.20309v1-abstract-full').style.display = 'inline'; document.getElementById('2503.20309v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.20309v1-abstract-full" style="display: none;"> Preference alignment has emerged as an effective strategy to enhance the performance of Multimodal Large Language Models (MLLMs) following supervised fine-tuning. While existing preference alignment methods predominantly target hallucination factors, they overlook the factors essential for multi-modal comprehension capabilities, often narrowing their improvements on hallucination mitigation. To bridge this gap, we propose Instruction-oriented Preference Alignment (IPA), a scalable framework designed to automatically construct alignment preferences grounded in instruction fulfillment efficacy. Our method involves an automated preference construction coupled with a dedicated verification process that identifies instruction-oriented factors, avoiding significant variability in response representations. Additionally, IPA incorporates a progressive preference collection pipeline, further recalling challenging samples through model self-evolution and reference-guided refinement. Experiments conducted on Qwen2VL-7B demonstrate IPA's effectiveness across multiple benchmarks, including hallucination evaluation, visual question answering, and text understanding tasks, highlighting its capability to enhance general comprehension. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.20309v1-abstract-full').style.display = 'none'; document.getElementById('2503.20309v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Technical report</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.20195">arXiv:2503.20195</a> <span> [<a href="https://arxiv.org/pdf/2503.20195">pdf</a>, <a href="https://arxiv.org/format/2503.20195">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Mutual Information-Empowered Task-Oriented Communication: Principles, Applications and Challenges </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+H">Hongru Li</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+S">Songjie Xie</a>, <a href="/search/cs?searchtype=author&query=Shao%2C+J">Jiawei Shao</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zixin Wang</a>, <a href="/search/cs?searchtype=author&query=He%2C+H">Hengtao He</a>, <a href="/search/cs?searchtype=author&query=Song%2C+S">Shenghui Song</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jun Zhang</a>, <a href="/search/cs?searchtype=author&query=Letaief%2C+K+B">Khaled B. Letaief</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.20195v1-abstract-short" style="display: inline;"> Mutual information (MI)-based guidelines have recently proven to be effective for designing task-oriented communication systems, where the ultimate goal is to extract and transmit task-relevant information for downstream task. This paper provides a comprehensive overview of MI-empowered task-oriented communication, highlighting how MI-based methods can serve as a unifying design framework in vario… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.20195v1-abstract-full').style.display = 'inline'; document.getElementById('2503.20195v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.20195v1-abstract-full" style="display: none;"> Mutual information (MI)-based guidelines have recently proven to be effective for designing task-oriented communication systems, where the ultimate goal is to extract and transmit task-relevant information for downstream task. This paper provides a comprehensive overview of MI-empowered task-oriented communication, highlighting how MI-based methods can serve as a unifying design framework in various task-oriented communication scenarios. We begin with the roadmap of MI for designing task-oriented communication systems, and then introduce the roles and applications of MI to guide feature encoding, transmission optimization, and efficient training with two case studies. We further elaborate the limitations and challenges of MI-based methods. Finally, we identify several open issues in MI-based task-oriented communication to inspire future research. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.20195v1-abstract-full').style.display = 'none'; document.getElementById('2503.20195v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">8 pages,5 figures, submitted to IEEE for potential publication</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.20179">arXiv:2503.20179</a> <span> [<a href="https://arxiv.org/pdf/2503.20179">pdf</a>, <a href="https://arxiv.org/format/2503.20179">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Quantitative Methods">q-bio.QM</span> </div> </div> <p class="title is-5 mathjax"> ProtoBERT-LoRA: Parameter-Efficient Prototypical Finetuning for Immunotherapy Study Identification </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+S">Shijia Zhang</a>, <a href="/search/cs?searchtype=author&query=Ding%2C+X">Xiyu Ding</a>, <a href="/search/cs?searchtype=author&query=Ding%2C+K">Kai Ding</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jacob Zhang</a>, <a href="/search/cs?searchtype=author&query=Galinsky%2C+K">Kevin Galinsky</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+M">Mengrui Wang</a>, <a href="/search/cs?searchtype=author&query=Mayers%2C+R+P">Ryan P. Mayers</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zheyu Wang</a>, <a href="/search/cs?searchtype=author&query=Kharrazi%2C+H">Hadi Kharrazi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.20179v1-abstract-short" style="display: inline;"> Identifying immune checkpoint inhibitor (ICI) studies in genomic repositories like Gene Expression Omnibus (GEO) is vital for cancer research yet remains challenging due to semantic ambiguity, extreme class imbalance, and limited labeled data in low-resource settings. We present ProtoBERT-LoRA, a hybrid framework that combines PubMedBERT with prototypical networks and Low-Rank Adaptation (LoRA) fo… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.20179v1-abstract-full').style.display = 'inline'; document.getElementById('2503.20179v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.20179v1-abstract-full" style="display: none;"> Identifying immune checkpoint inhibitor (ICI) studies in genomic repositories like Gene Expression Omnibus (GEO) is vital for cancer research yet remains challenging due to semantic ambiguity, extreme class imbalance, and limited labeled data in low-resource settings. We present ProtoBERT-LoRA, a hybrid framework that combines PubMedBERT with prototypical networks and Low-Rank Adaptation (LoRA) for efficient fine-tuning. The model enforces class-separable embeddings via episodic prototype training while preserving biomedical domain knowledge. Our dataset was divided as: Training (20 positive, 20 negative), Prototype Set (10 positive, 10 negative), Validation (20 positive, 200 negative), and Test (71 positive, 765 negative). Evaluated on test dataset, ProtoBERT-LoRA achieved F1-score of 0.624 (precision: 0.481, recall: 0.887), outperforming the rule-based system, machine learning baselines and finetuned PubMedBERT. Application to 44,287 unlabeled studies reduced manual review efforts by 82%. Ablation studies confirmed that combining prototypes with LoRA improved performance by 29% over stand-alone LoRA. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.20179v1-abstract-full').style.display = 'none'; document.getElementById('2503.20179v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Submitted to AMIA 2025 Annual Symposium</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.20113">arXiv:2503.20113</a> <span> [<a href="https://arxiv.org/pdf/2503.20113">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Domain Adaptation Framework for Turning Movement Count Estimation with Limited Data </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ma%2C+X">Xiaobo Ma</a>, <a href="/search/cs?searchtype=author&query=Noh%2C+H">Hyunsoo Noh</a>, <a href="/search/cs?searchtype=author&query=Hatch%2C+R">Ryan Hatch</a>, <a href="/search/cs?searchtype=author&query=Tokishi%2C+J">James Tokishi</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zepu Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.20113v1-abstract-short" style="display: inline;"> Urban transportation networks are vital for the efficient movement of people and goods, necessitating effective traffic management and planning. An integral part of traffic management is understanding the turning movement counts (TMCs) at intersections, Accurate TMCs at intersections are crucial for traffic signal control, congestion mitigation, and road safety. In general, TMCs are obtained using… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.20113v1-abstract-full').style.display = 'inline'; document.getElementById('2503.20113v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.20113v1-abstract-full" style="display: none;"> Urban transportation networks are vital for the efficient movement of people and goods, necessitating effective traffic management and planning. An integral part of traffic management is understanding the turning movement counts (TMCs) at intersections, Accurate TMCs at intersections are crucial for traffic signal control, congestion mitigation, and road safety. In general, TMCs are obtained using physical sensors installed at intersections, but this approach can be cost-prohibitive and technically challenging, especially for cities with extensive road networks. Recent advancements in machine learning and data-driven approaches have offered promising alternatives for estimating TMCs. Traffic patterns can vary significantly across different intersections due to factors such as road geometry, traffic signal settings, and local driver behaviors. This domain discrepancy limits the generalizability and accuracy of machine learning models when applied to new or unseen intersections. In response to these limitations, this research proposes a novel framework leveraging domain adaptation (DA) to estimate TMCs at intersections by using traffic controller event-based data, road infrastructure data, and point-of-interest (POI) data. Evaluated on 30 intersections in Tucson, Arizona, the performance of the proposed DA framework was compared with state-of-the-art models and achieved the lowest values in terms of Mean Absolute Error and Root Mean Square Error. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.20113v1-abstract-full').style.display = 'none'; document.getElementById('2503.20113v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">arXiv admin note: substantial text overlap with arXiv:2412.09861</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.19859">arXiv:2503.19859</a> <span> [<a href="https://arxiv.org/pdf/2503.19859">pdf</a>, <a href="https://arxiv.org/format/2503.19859">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Optimization and Control">math.OC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation">stat.CO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> An Overview of Low-Rank Structures in the Training and Adaptation of Large Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Balzano%2C+L">Laura Balzano</a>, <a href="/search/cs?searchtype=author&query=Ding%2C+T">Tianjiao Ding</a>, <a href="/search/cs?searchtype=author&query=Haeffele%2C+B+D">Benjamin D. Haeffele</a>, <a href="/search/cs?searchtype=author&query=Kwon%2C+S+M">Soo Min Kwon</a>, <a href="/search/cs?searchtype=author&query=Qu%2C+Q">Qing Qu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+P">Peng Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zhangyang Wang</a>, <a href="/search/cs?searchtype=author&query=Yaras%2C+C">Can Yaras</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.19859v1-abstract-short" style="display: inline;"> The rise of deep learning has revolutionized data processing and prediction in signal processing and machine learning, yet the substantial computational demands of training and deploying modern large-scale deep models present significant challenges, including high computational costs and energy consumption. Recent research has uncovered a widespread phenomenon in deep networks: the emergence of lo… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.19859v1-abstract-full').style.display = 'inline'; document.getElementById('2503.19859v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.19859v1-abstract-full" style="display: none;"> The rise of deep learning has revolutionized data processing and prediction in signal processing and machine learning, yet the substantial computational demands of training and deploying modern large-scale deep models present significant challenges, including high computational costs and energy consumption. Recent research has uncovered a widespread phenomenon in deep networks: the emergence of low-rank structures in weight matrices and learned representations during training. These implicit low-dimensional patterns provide valuable insights for improving the efficiency of training and fine-tuning large-scale models. Practical techniques inspired by this phenomenon, such as low-rank adaptation (LoRA) and training, enable significant reductions in computational cost while preserving model performance. In this paper, we present a comprehensive review of recent advances in exploiting low-rank structures for deep learning and shed light on their mathematical foundations. Mathematically, we present two complementary perspectives on understanding the low-rankness in deep networks: (i) the emergence of low-rank structures throughout the whole optimization dynamics of gradient and (ii) the implicit regularization effects that induce such low-rank structures at convergence. From a practical standpoint, studying the low-rank learning dynamics of gradient descent offers a mathematical foundation for understanding the effectiveness of LoRA in fine-tuning large-scale models and inspires parameter-efficient low-rank training strategies. Furthermore, the implicit low-rank regularization effect helps explain the success of various masked training approaches in deep neural networks, ranging from dropout to masked self-supervised learning. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.19859v1-abstract-full').style.display = 'none'; document.getElementById('2503.19859v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Authors are listed alphabetically; 27 pages, 10 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.19502">arXiv:2503.19502</a> <span> [<a href="https://arxiv.org/pdf/2503.19502">pdf</a>, <a href="https://arxiv.org/format/2503.19502">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Geophysics">physics.geo-ph</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Towards Long-Range ENSO Prediction with an Explainable Deep Learning Model </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+Q">Qi Chen</a>, <a href="/search/cs?searchtype=author&query=Cui%2C+Y">Yinghao Cui</a>, <a href="/search/cs?searchtype=author&query=Hong%2C+G">Guobin Hong</a>, <a href="/search/cs?searchtype=author&query=Ashok%2C+K">Karumuri Ashok</a>, <a href="/search/cs?searchtype=author&query=Pu%2C+Y">Yuchun Pu</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+X">Xiaogu Zheng</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xuanze Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhong%2C+W">Wei Zhong</a>, <a href="/search/cs?searchtype=author&query=Zhan%2C+P">Peng Zhan</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zhonglei Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.19502v1-abstract-short" style="display: inline;"> El Ni帽o-Southern Oscillation (ENSO) is a prominent mode of interannual climate variability with far-reaching global impacts. Its evolution is governed by intricate air-sea interactions, posing significant challenges for long-term prediction. In this study, we introduce CTEFNet, a multivariate deep learning model that synergizes convolutional neural networks and transformers to enhance ENSO forecas… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.19502v1-abstract-full').style.display = 'inline'; document.getElementById('2503.19502v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.19502v1-abstract-full" style="display: none;"> El Ni帽o-Southern Oscillation (ENSO) is a prominent mode of interannual climate variability with far-reaching global impacts. Its evolution is governed by intricate air-sea interactions, posing significant challenges for long-term prediction. In this study, we introduce CTEFNet, a multivariate deep learning model that synergizes convolutional neural networks and transformers to enhance ENSO forecasting. By integrating multiple oceanic and atmospheric predictors, CTEFNet extends the effective forecast lead time to 20 months while mitigating the impact of the spring predictability barrier, outperforming both dynamical models and state-of-the-art deep learning approaches. Furthermore, CTEFNet offers physically meaningful and statistically significant insights through gradient-based sensitivity analysis, revealing the key precursor signals that govern ENSO dynamics, which align with well-established theories and reveal new insights about inter-basin interactions among the Pacific, Atlantic, and Indian Oceans. The CTEFNet's superior predictive skill and interpretable sensitivity assessments underscore its potential for advancing climate prediction. Our findings highlight the importance of multivariate coupling in ENSO evolution and demonstrate the promise of deep learning in capturing complex climate dynamics with enhanced interpretability. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.19502v1-abstract-full').style.display = 'none'; document.getElementById('2503.19502v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.19482">arXiv:2503.19482</a> <span> [<a href="https://arxiv.org/pdf/2503.19482">pdf</a>, <a href="https://arxiv.org/format/2503.19482">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> KSHSeek: Data-Driven Approaches to Mitigating and Detecting Knowledge-Shortcut Hallucinations in Generative Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zhiwei Wang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Z">Zhongxin Liu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Ying Li</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+H">Hongyu Sun</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+M">Meng Xu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yuqing Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.19482v1-abstract-short" style="display: inline;"> The emergence of large language models (LLMs) has significantly advanced the development of natural language processing (NLP), especially in text generation tasks like question answering. However, model hallucinations remain a major challenge in natural language generation (NLG) tasks due to their complex causes. We systematically expand on the causes of factual hallucinations from the perspective… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.19482v1-abstract-full').style.display = 'inline'; document.getElementById('2503.19482v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.19482v1-abstract-full" style="display: none;"> The emergence of large language models (LLMs) has significantly advanced the development of natural language processing (NLP), especially in text generation tasks like question answering. However, model hallucinations remain a major challenge in natural language generation (NLG) tasks due to their complex causes. We systematically expand on the causes of factual hallucinations from the perspective of knowledge shortcuts, analyzing hallucinations arising from correct and defect-free data and demonstrating that knowledge-shortcut hallucinations are prevalent in generative models. To mitigate this issue, we propose a high similarity pruning algorithm at the data preprocessing level to reduce spurious correlations in the data. Additionally, we design a specific detection method for knowledge-shortcut hallucinations to evaluate the effectiveness of our mitigation strategy. Experimental results show that our approach effectively reduces knowledge-shortcut hallucinations, particularly in fine-tuning tasks, without negatively impacting model performance in question answering. This work introduces a new paradigm for mitigating specific hallucination issues in generative models, enhancing their robustness and reliability in real-world applications. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.19482v1-abstract-full').style.display = 'none'; document.getElementById('2503.19482v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">16 pages, 34 figures</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> I.2.7; I.2.6 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.19476">arXiv:2503.19476</a> <span> [<a href="https://arxiv.org/pdf/2503.19476">pdf</a>, <a href="https://arxiv.org/format/2503.19476">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Extracting Interpretable Logic Rules from Graph Neural Networks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Geng%2C+C">Chuqin Geng</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zhaoyue Wang</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+Z">Ziyu Zhao</a>, <a href="/search/cs?searchtype=author&query=Ye%2C+H">Haolin Ye</a>, <a href="/search/cs?searchtype=author&query=Si%2C+X">Xujie Si</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.19476v1-abstract-short" style="display: inline;"> Graph neural networks (GNNs) operate over both input feature spaces and combinatorial graph structures, making it challenging to understand the rationale behind their predictions. As GNNs gain widespread popularity and demonstrate success across various domains, such as drug discovery, studying their interpretability has become a critical task. To address this, many explainability methods have bee… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.19476v1-abstract-full').style.display = 'inline'; document.getElementById('2503.19476v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.19476v1-abstract-full" style="display: none;"> Graph neural networks (GNNs) operate over both input feature spaces and combinatorial graph structures, making it challenging to understand the rationale behind their predictions. As GNNs gain widespread popularity and demonstrate success across various domains, such as drug discovery, studying their interpretability has become a critical task. To address this, many explainability methods have been proposed, with recent efforts shifting from instance-specific explanations to global concept-based explainability. However, these approaches face several limitations, such as relying on predefined concepts and explaining only a limited set of patterns. To address this, we propose a novel framework, LOGICXGNN, for extracting interpretable logic rules from GNNs. LOGICXGNN is model-agnostic, efficient, and data-driven, eliminating the need for predefined concepts. More importantly, it can serve as a rule-based classifier and even outperform the original neural models. Its interpretability facilitates knowledge discovery, as demonstrated by its ability to extract detailed and accurate chemistry knowledge that is often overlooked by existing methods. Another key advantage of LOGICXGNN is its ability to generate new graph instances in a controlled and transparent manner, offering significant potential for applications such as drug design. We empirically demonstrate these merits through experiments on real-world datasets such as MUTAG and BBBP. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.19476v1-abstract-full').style.display = 'none'; document.getElementById('2503.19476v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">12 pages, 4 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.19463">arXiv:2503.19463</a> <span> [<a href="https://arxiv.org/pdf/2503.19463">pdf</a>, <a href="https://arxiv.org/ps/2503.19463">ps</a>, <a href="https://arxiv.org/format/2503.19463">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Discrete Mathematics">cs.DM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Combinatorics">math.CO</span> </div> </div> <p class="title is-5 mathjax"> The $g$-good-neighbor diagnosability of product networks under the PMC model </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zhao Wang</a>, <a href="/search/cs?searchtype=author&query=Mao%2C+Y">Yaping Mao</a>, <a href="/search/cs?searchtype=author&query=Hsieh%2C+S">Sun-Yuan Hsieh</a>, <a href="/search/cs?searchtype=author&query=Klasing%2C+R">Ralf Klasing</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.19463v1-abstract-short" style="display: inline;"> The concept of neighbor connectivity originated from the assessment of the subversion of espionage networks caused by underground resistance movements, and it has now been applied to measure the disruption of networks caused by cascading failures through neighbors. In this paper, we give two necessary and sufficient conditions of the existance of $g$-good-neighbor diagnosability. We introduce a ne… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.19463v1-abstract-full').style.display = 'inline'; document.getElementById('2503.19463v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.19463v1-abstract-full" style="display: none;"> The concept of neighbor connectivity originated from the assessment of the subversion of espionage networks caused by underground resistance movements, and it has now been applied to measure the disruption of networks caused by cascading failures through neighbors. In this paper, we give two necessary and sufficient conditions of the existance of $g$-good-neighbor diagnosability. We introduce a new concept called $g$-good neighbor cut-component number (gc number for short), which has close relation with $g$-good-neighbor diagnosability. Sharp lower and upper bounds of the gc number of general graphs in terms of the $g$-good neighbor connectivity is given, which provides a formula to compute the $g$-good-neighbor diagnosability for general graphs (therefore for Cartesian product graphs). As their applications, we get the exact values or bounds for the gc numbers and $g$-good-neighbor diagnosability of grid, torus networks and generalized cubes. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.19463v1-abstract-full').style.display = 'none'; document.getElementById('2503.19463v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">22 pages; 9 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.19407">arXiv:2503.19407</a> <span> [<a href="https://arxiv.org/pdf/2503.19407">pdf</a>, <a href="https://arxiv.org/format/2503.19407">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> A Prototype-Guided Coarse Annotations Refining Approach for Whole Slide Images </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yao%2C+B">Bingjian Yao</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+W">Weiping Lin</a>, <a href="/search/cs?searchtype=author&query=He%2C+Y">Yan He</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zheng Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+L">Liangsheng Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.19407v1-abstract-short" style="display: inline;"> The fine-grained annotations in whole slide images (WSIs) show the boundaries of various pathological regions. However, generating such detailed annotation is often costly, whereas the coarse annotations are relatively simpler to produce. Existing methods for refining coarse annotations often rely on extensive training samples or clean datasets, and fail to capture both intra-slide and inter-slide… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.19407v1-abstract-full').style.display = 'inline'; document.getElementById('2503.19407v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.19407v1-abstract-full" style="display: none;"> The fine-grained annotations in whole slide images (WSIs) show the boundaries of various pathological regions. However, generating such detailed annotation is often costly, whereas the coarse annotations are relatively simpler to produce. Existing methods for refining coarse annotations often rely on extensive training samples or clean datasets, and fail to capture both intra-slide and inter-slide latent sematic patterns, limiting their precision. In this paper, we propose a prototype-guided approach. Specifically, we introduce a local-to-global approach to construct non-redundant representative prototypes by jointly modeling intra-slide local semantics and inter-slide contextual relationships. Then a prototype-guided pseudo-labeling module is proposed for refining coarse annotations. Finally, we employ dynamic data sampling and re-finetuning strategy to train a patch classifier. Extensive experiments on three publicly available WSI datasets, covering lymph, liver, and colorectal cancers, demonstrate that our method significantly outperforms existing state-of-the-art (SOTA) methods. The code will be available. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.19407v1-abstract-full').style.display = 'none'; document.getElementById('2503.19407v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">10 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.19207">arXiv:2503.19207</a> <span> [<a href="https://arxiv.org/pdf/2503.19207">pdf</a>, <a href="https://arxiv.org/format/2503.19207">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> FRESA:Feedforward Reconstruction of Personalized Skinned Avatars from Few Images </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+R">Rong Wang</a>, <a href="/search/cs?searchtype=author&query=Prada%2C+F">Fabian Prada</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Ziyan Wang</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+Z">Zhongshi Jiang</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+C">Chengxiang Yin</a>, <a href="/search/cs?searchtype=author&query=Li%2C+J">Junxuan Li</a>, <a href="/search/cs?searchtype=author&query=Saito%2C+S">Shunsuke Saito</a>, <a href="/search/cs?searchtype=author&query=Santesteban%2C+I">Igor Santesteban</a>, <a href="/search/cs?searchtype=author&query=Romero%2C+J">Javier Romero</a>, <a href="/search/cs?searchtype=author&query=Joshi%2C+R">Rohan Joshi</a>, <a href="/search/cs?searchtype=author&query=Li%2C+H">Hongdong Li</a>, <a href="/search/cs?searchtype=author&query=Saragih%2C+J">Jason Saragih</a>, <a href="/search/cs?searchtype=author&query=Sheikh%2C+Y">Yaser Sheikh</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.19207v1-abstract-short" style="display: inline;"> We present a novel method for reconstructing personalized 3D human avatars with realistic animation from only a few images. Due to the large variations in body shapes, poses, and cloth types, existing methods mostly require hours of per-subject optimization during inference, which limits their practical applications. In contrast, we learn a universal prior from over a thousand clothed humans to ac… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.19207v1-abstract-full').style.display = 'inline'; document.getElementById('2503.19207v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.19207v1-abstract-full" style="display: none;"> We present a novel method for reconstructing personalized 3D human avatars with realistic animation from only a few images. Due to the large variations in body shapes, poses, and cloth types, existing methods mostly require hours of per-subject optimization during inference, which limits their practical applications. In contrast, we learn a universal prior from over a thousand clothed humans to achieve instant feedforward generation and zero-shot generalization. Specifically, instead of rigging the avatar with shared skinning weights, we jointly infer personalized avatar shape, skinning weights, and pose-dependent deformations, which effectively improves overall geometric fidelity and reduces deformation artifacts. Moreover, to normalize pose variations and resolve coupled ambiguity between canonical shapes and skinning weights, we design a 3D canonicalization process to produce pixel-aligned initial conditions, which helps to reconstruct fine-grained geometric details. We then propose a multi-frame feature aggregation to robustly reduce artifacts introduced in canonicalization and fuse a plausible avatar preserving person-specific identities. Finally, we train the model in an end-to-end framework on a large-scale capture dataset, which contains diverse human subjects paired with high-quality 3D scans. Extensive experiments show that our method generates more authentic reconstruction and animation than state-of-the-arts, and can be directly generalized to inputs from casually taken phone photos. Project page and code is available at https://github.com/rongakowang/FRESA. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.19207v1-abstract-full').style.display = 'none'; document.getElementById('2503.19207v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Published in CVPR 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.18968">arXiv:2503.18968</a> <span> [<a href="https://arxiv.org/pdf/2503.18968">pdf</a>, <a href="https://arxiv.org/format/2503.18968">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> MedAgent-Pro: Towards Multi-modal Evidence-based Medical Diagnosis via Reasoning Agentic Workflow </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Ziyue Wang</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+J">Junde Wu</a>, <a href="/search/cs?searchtype=author&query=Low%2C+C+H">Chang Han Low</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+Y">Yueming Jin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.18968v1-abstract-short" style="display: inline;"> Developing reliable AI systems to assist human clinicians in multi-modal medical diagnosis has long been a key objective for researchers. Recently, Multi-modal Large Language Models (MLLMs) have gained significant attention and achieved success across various domains. With strong reasoning capabilities and the ability to perform diverse tasks based on user instructions, they hold great potential f… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.18968v1-abstract-full').style.display = 'inline'; document.getElementById('2503.18968v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.18968v1-abstract-full" style="display: none;"> Developing reliable AI systems to assist human clinicians in multi-modal medical diagnosis has long been a key objective for researchers. Recently, Multi-modal Large Language Models (MLLMs) have gained significant attention and achieved success across various domains. With strong reasoning capabilities and the ability to perform diverse tasks based on user instructions, they hold great potential for enhancing medical diagnosis. However, directly applying MLLMs to the medical domain still presents challenges. They lack detailed perception of visual inputs, limiting their ability to perform quantitative image analysis, which is crucial for medical diagnostics. Additionally, MLLMs often exhibit hallucinations and inconsistencies in reasoning, whereas clinical diagnoses must adhere strictly to established criteria. To address these challenges, we propose MedAgent-Pro, an evidence-based reasoning agentic system designed to achieve reliable, explainable, and precise medical diagnoses. This is accomplished through a hierarchical workflow: at the task level, knowledge-based reasoning generate reliable diagnostic plans for specific diseases following retrieved clinical criteria. While at the case level, multiple tool agents process multi-modal inputs, analyze different indicators according to the plan, and provide a final diagnosis based on both quantitative and qualitative evidence. Comprehensive experiments on both 2D and 3D medical diagnosis tasks demonstrate the superiority and effectiveness of MedAgent-Pro, while case studies further highlight its reliability and interpretability. The code is available at https://github.com/jinlab-imvr/MedAgent-Pro. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.18968v1-abstract-full').style.display = 'none'; document.getElementById('2503.18968v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.18891">arXiv:2503.18891</a> <span> [<a href="https://arxiv.org/pdf/2503.18891">pdf</a>, <a href="https://arxiv.org/format/2503.18891">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> AgentDropout: Dynamic Agent Elimination for Token-Efficient and High-Performance LLM-Based Multi-Agent Collaboration </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zhexuan Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yutong Wang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+X">Xuebo Liu</a>, <a href="/search/cs?searchtype=author&query=Ding%2C+L">Liang Ding</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+M">Miao Zhang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+J">Jie Liu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+M">Min Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.18891v1-abstract-short" style="display: inline;"> Multi-agent systems (MAS) based on large language models (LLMs) have demonstrated significant potential in collaborative problem-solving. However, they still face substantial challenges of low communication efficiency and suboptimal task performance, making the careful design of the agents' communication topologies particularly important. Inspired by the management theory that roles in an efficien… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.18891v1-abstract-full').style.display = 'inline'; document.getElementById('2503.18891v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.18891v1-abstract-full" style="display: none;"> Multi-agent systems (MAS) based on large language models (LLMs) have demonstrated significant potential in collaborative problem-solving. However, they still face substantial challenges of low communication efficiency and suboptimal task performance, making the careful design of the agents' communication topologies particularly important. Inspired by the management theory that roles in an efficient team are often dynamically adjusted, we propose AgentDropout, which identifies redundant agents and communication across different communication rounds by optimizing the adjacency matrices of the communication graphs and eliminates them to enhance both token efficiency and task performance. Compared to state-of-the-art methods, AgentDropout achieves an average reduction of 21.6% in prompt token consumption and 18.4% in completion token consumption, along with a performance improvement of 1.14 on the tasks. Furthermore, the extended experiments demonstrate that AgentDropout achieves notable domain transferability and structure robustness, revealing its reliability and effectiveness. We release our code at https://github.com/wangzx1219/AgentDropout. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.18891v1-abstract-full').style.display = 'none'; document.getElementById('2503.18891v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.18808">arXiv:2503.18808</a> <span> [<a href="https://arxiv.org/pdf/2503.18808">pdf</a>, <a href="https://arxiv.org/format/2503.18808">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> CRCL: Causal Representation Consistency Learning for Anomaly Detection in Surveillance Videos </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yang Liu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+H">Hongjin Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zepu Wang</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+X">Xiaoguang Zhu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+J">Jing Liu</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+P">Peng Sun</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+R">Rui Tang</a>, <a href="/search/cs?searchtype=author&query=Du%2C+J">Jianwei Du</a>, <a href="/search/cs?searchtype=author&query=Leung%2C+V+C+M">Victor C. M. Leung</a>, <a href="/search/cs?searchtype=author&query=Song%2C+L">Liang Song</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.18808v1-abstract-short" style="display: inline;"> Video Anomaly Detection (VAD) remains a fundamental yet formidable task in the video understanding community, with promising applications in areas such as information forensics and public safety protection. Due to the rarity and diversity of anomalies, existing methods only use easily collected regular events to model the inherent normality of normal spatial-temporal patterns in an unsupervised ma… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.18808v1-abstract-full').style.display = 'inline'; document.getElementById('2503.18808v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.18808v1-abstract-full" style="display: none;"> Video Anomaly Detection (VAD) remains a fundamental yet formidable task in the video understanding community, with promising applications in areas such as information forensics and public safety protection. Due to the rarity and diversity of anomalies, existing methods only use easily collected regular events to model the inherent normality of normal spatial-temporal patterns in an unsupervised manner. Previous studies have shown that existing unsupervised VAD models are incapable of label-independent data offsets (e.g., scene changes) in real-world scenarios and may fail to respond to light anomalies due to the overgeneralization of deep neural networks. Inspired by causality learning, we argue that there exist causal factors that can adequately generalize the prototypical patterns of regular events and present significant deviations when anomalous instances occur. In this regard, we propose Causal Representation Consistency Learning (CRCL) to implicitly mine potential scene-robust causal variable in unsupervised video normality learning. Specifically, building on the structural causal models, we propose scene-debiasing learning and causality-inspired normality learning to strip away entangled scene bias in deep representations and learn causal video normality, respectively. Extensive experiments on benchmarks validate the superiority of our method over conventional deep representation learning. Moreover, ablation studies and extension validation show that the CRCL can cope with label-independent biases in multi-scene settings and maintain stable performance with only limited training data available. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.18808v1-abstract-full').style.display = 'none'; document.getElementById('2503.18808v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted for publication by IEEE Transactions on Image Processing</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.18802">arXiv:2503.18802</a> <span> [<a href="https://arxiv.org/pdf/2503.18802">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.5334/tismir.194">10.5334/tismir.194 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> CCMusic: An Open and Diverse Database for Chinese Music Information Retrieval Research </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhou%2C+M">Monan Zhou</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+S">Shenyang Xu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Z">Zhaorui Liu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zhaowen Wang</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+F">Feng Yu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+W">Wei Li</a>, <a href="/search/cs?searchtype=author&query=Han%2C+B">Baoqiang Han</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.18802v1-abstract-short" style="display: inline;"> Data are crucial in various computer-related fields, including music information retrieval (MIR), an interdisciplinary area bridging computer science and music. This paper introduces CCMusic, an open and diverse database comprising multiple datasets specifically designed for tasks related to Chinese music, highlighting our focus on this culturally rich domain. The database integrates both publishe… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.18802v1-abstract-full').style.display = 'inline'; document.getElementById('2503.18802v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.18802v1-abstract-full" style="display: none;"> Data are crucial in various computer-related fields, including music information retrieval (MIR), an interdisciplinary area bridging computer science and music. This paper introduces CCMusic, an open and diverse database comprising multiple datasets specifically designed for tasks related to Chinese music, highlighting our focus on this culturally rich domain. The database integrates both published and unpublished datasets, with steps taken such as data cleaning, label refinement, and data structure unification to ensure data consistency and create ready-to-use versions. We conduct benchmark evaluations for all datasets using a unified evaluation framework developed specifically for this purpose. This publicly available framework supports both classification and detection tasks, ensuring standardized and reproducible results across all datasets. The database is hosted on HuggingFace and ModelScope, two open and multifunctional data and model hosting platforms, ensuring ease of accessibility and usability. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.18802v1-abstract-full').style.display = 'none'; document.getElementById('2503.18802v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">17 pages, 18 figures</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> Transactions of the International Society for Music Information Retrieval, 2025, 8(1), 22-38 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.18536">arXiv:2503.18536</a> <span> [<a href="https://arxiv.org/pdf/2503.18536">pdf</a>, <a href="https://arxiv.org/format/2503.18536">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> DiN: Diffusion Model for Robust Medical VQA with Semantic Noisy Labels </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Guo%2C+E">Erjian Guo</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+Z">Zhen Zhao</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zicheng Wang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+T">Tong Chen</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yunyi Liu</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+L">Luping Zhou</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.18536v1-abstract-short" style="display: inline;"> Medical Visual Question Answering (Med-VQA) systems benefit the interpretation of medical images containing critical clinical information. However, the challenge of noisy labels and limited high-quality datasets remains underexplored. To address this, we establish the first benchmark for noisy labels in Med-VQA by simulating human mislabeling with semantically designed noise types. More importantl… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.18536v1-abstract-full').style.display = 'inline'; document.getElementById('2503.18536v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.18536v1-abstract-full" style="display: none;"> Medical Visual Question Answering (Med-VQA) systems benefit the interpretation of medical images containing critical clinical information. However, the challenge of noisy labels and limited high-quality datasets remains underexplored. To address this, we establish the first benchmark for noisy labels in Med-VQA by simulating human mislabeling with semantically designed noise types. More importantly, we introduce the DiN framework, which leverages a diffusion model to handle noisy labels in Med-VQA. Unlike the dominant classification-based VQA approaches that directly predict answers, our Answer Diffuser (AD) module employs a coarse-to-fine process, refining answer candidates with a diffusion model for improved accuracy. The Answer Condition Generator (ACG) further enhances this process by generating task-specific conditional information via integrating answer embeddings with fused image-question features. To address label noise, our Noisy Label Refinement(NLR) module introduces a robust loss function and dynamic answer adjustment to further boost the performance of the AD module. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.18536v1-abstract-full').style.display = 'none'; document.getElementById('2503.18536v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.18483">arXiv:2503.18483</a> <span> [<a href="https://arxiv.org/pdf/2503.18483">pdf</a>, <a href="https://arxiv.org/format/2503.18483">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Explaining Domain Shifts in Language: Concept erasing for Interpretable Image Classification </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zeng%2C+Z">Zequn Zeng</a>, <a href="/search/cs?searchtype=author&query=Su%2C+Y">Yudi Su</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+J">Jianqiao Sun</a>, <a href="/search/cs?searchtype=author&query=Wen%2C+T">Tiansheng Wen</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+H">Hao Zhang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zhengjue Wang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+B">Bo Chen</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+H">Hongwei Liu</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+J">Jiawei Ma</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.18483v1-abstract-short" style="display: inline;"> Concept-based models can map black-box representations to human-understandable concepts, which makes the decision-making process more transparent and then allows users to understand the reason behind predictions. However, domain-specific concepts often impact the final predictions, which subsequently undermine the model generalization capabilities, and prevent the model from being used in high-sta… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.18483v1-abstract-full').style.display = 'inline'; document.getElementById('2503.18483v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.18483v1-abstract-full" style="display: none;"> Concept-based models can map black-box representations to human-understandable concepts, which makes the decision-making process more transparent and then allows users to understand the reason behind predictions. However, domain-specific concepts often impact the final predictions, which subsequently undermine the model generalization capabilities, and prevent the model from being used in high-stake applications. In this paper, we propose a novel Language-guided Concept-Erasing (LanCE) framework. In particular, we empirically demonstrate that pre-trained vision-language models (VLMs) can approximate distinct visual domain shifts via domain descriptors while prompting large Language Models (LLMs) can easily simulate a wide range of descriptors of unseen visual domains. Then, we introduce a novel plug-in domain descriptor orthogonality (DDO) regularizer to mitigate the impact of these domain-specific concepts on the final predictions. Notably, the DDO regularizer is agnostic to the design of concept-based models and we integrate it into several prevailing models. Through evaluation of domain generalization on four standard benchmarks and three newly introduced benchmarks, we demonstrate that DDO can significantly improve the out-of-distribution (OOD) generalization over the previous state-of-the-art concept-based models.Our code is available at https://github.com/joeyz0z/LanCE. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.18483v1-abstract-full').style.display = 'none'; document.getElementById('2503.18483v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by CVPR2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.18445">arXiv:2503.18445</a> <span> [<a href="https://arxiv.org/pdf/2503.18445">pdf</a>, <a href="https://arxiv.org/format/2503.18445">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Benchmarking Multi-modal Semantic Segmentation under Sensor Failures: Missing and Noisy Modality Robustness </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liao%2C+C">Chenfei Liao</a>, <a href="/search/cs?searchtype=author&query=Lei%2C+K">Kaiyu Lei</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+X">Xu Zheng</a>, <a href="/search/cs?searchtype=author&query=Moon%2C+J">Junha Moon</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zhixiong Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yixuan Wang</a>, <a href="/search/cs?searchtype=author&query=Paudel%2C+D+P">Danda Pani Paudel</a>, <a href="/search/cs?searchtype=author&query=Van+Gool%2C+L">Luc Van Gool</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+X">Xuming Hu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.18445v1-abstract-short" style="display: inline;"> Multi-modal semantic segmentation (MMSS) addresses the limitations of single-modality data by integrating complementary information across modalities. Despite notable progress, a significant gap persists between research and real-world deployment due to variability and uncertainty in multi-modal data quality. Robustness has thus become essential for practical MMSS applications. However, the absenc… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.18445v1-abstract-full').style.display = 'inline'; document.getElementById('2503.18445v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.18445v1-abstract-full" style="display: none;"> Multi-modal semantic segmentation (MMSS) addresses the limitations of single-modality data by integrating complementary information across modalities. Despite notable progress, a significant gap persists between research and real-world deployment due to variability and uncertainty in multi-modal data quality. Robustness has thus become essential for practical MMSS applications. However, the absence of standardized benchmarks for evaluating robustness hinders further advancement. To address this, we first survey existing MMSS literature and categorize representative methods to provide a structured overview. We then introduce a robustness benchmark that evaluates MMSS models under three scenarios: Entire-Missing Modality (EMM), Random-Missing Modality (RMM), and Noisy Modality (NM). From a probabilistic standpoint, we model modality failure under two conditions: (1) all damaged combinations are equally probable; (2) each modality fails independently following a Bernoulli distribution. Based on these, we propose four metrics-$mIoU^{Avg}_{EMM}$, $mIoU^{E}_{EMM}$, $mIoU^{Avg}_{RMM}$, and $mIoU^{E}_{RMM}$-to assess model robustness under EMM and RMM. This work provides the first dedicated benchmark for MMSS robustness, offering new insights and tools to advance the field. Source code is available at https://github.com/Chenfei-Liao/Multi-Modal-Semantic-Segmentation-Robustness-Benchmark. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.18445v1-abstract-full').style.display = 'none'; document.getElementById('2503.18445v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.18286">arXiv:2503.18286</a> <span> [<a href="https://arxiv.org/pdf/2503.18286">pdf</a>, <a href="https://arxiv.org/format/2503.18286">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> CO-SPY: Combining Semantic and Pixel Features to Detect Synthetic Images by AI </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Cheng%2C+S">Siyuan Cheng</a>, <a href="/search/cs?searchtype=author&query=Lyu%2C+L">Lingjuan Lyu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zhenting Wang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xiangyu Zhang</a>, <a href="/search/cs?searchtype=author&query=Sehwag%2C+V">Vikash Sehwag</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.18286v1-abstract-short" style="display: inline;"> With the rapid advancement of generative AI, it is now possible to synthesize high-quality images in a few seconds. Despite the power of these technologies, they raise significant concerns regarding misuse. Current efforts to distinguish between real and AI-generated images may lack generalization, being effective for only certain types of generative models and susceptible to post-processing techn… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.18286v1-abstract-full').style.display = 'inline'; document.getElementById('2503.18286v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.18286v1-abstract-full" style="display: none;"> With the rapid advancement of generative AI, it is now possible to synthesize high-quality images in a few seconds. Despite the power of these technologies, they raise significant concerns regarding misuse. Current efforts to distinguish between real and AI-generated images may lack generalization, being effective for only certain types of generative models and susceptible to post-processing techniques like JPEG compression. To overcome these limitations, we propose a novel framework, Co-Spy, that first enhances existing semantic features (e.g., the number of fingers in a hand) and artifact features (e.g., pixel value differences), and then adaptively integrates them to achieve more general and robust synthetic image detection. Additionally, we create Co-Spy-Bench, a comprehensive dataset comprising 5 real image datasets and 22 state-of-the-art generative models, including the latest models like FLUX. We also collect 50k synthetic images in the wild from the Internet to enable evaluation in a more practical setting. Our extensive evaluations demonstrate that our detector outperforms existing methods under identical training conditions, achieving an average accuracy improvement of approximately 11% to 34%. The code is available at https://github.com/Megum1/Co-Spy. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.18286v1-abstract-full').style.display = 'none'; document.getElementById('2503.18286v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> The IEEE/CVF Conference on Computer Vision and Pattern Recognition 2025 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.18142">arXiv:2503.18142</a> <span> [<a href="https://arxiv.org/pdf/2503.18142">pdf</a>, <a href="https://arxiv.org/format/2503.18142">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> LocDiffusion: Identifying Locations on Earth by Diffusing in the Hilbert Space </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zhangyu Wang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jielu Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+Z">Zhongliang Zhou</a>, <a href="/search/cs?searchtype=author&query=Cao%2C+Q">Qian Cao</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+N">Nemin Wu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Z">Zeping Liu</a>, <a href="/search/cs?searchtype=author&query=Mu%2C+L">Lan Mu</a>, <a href="/search/cs?searchtype=author&query=Song%2C+Y">Yang Song</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+Y">Yiqun Xie</a>, <a href="/search/cs?searchtype=author&query=Lao%2C+N">Ni Lao</a>, <a href="/search/cs?searchtype=author&query=Mai%2C+G">Gengchen Mai</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.18142v1-abstract-short" style="display: inline;"> Image geolocalization is a fundamental yet challenging task, aiming at inferring the geolocation on Earth where an image is taken. Existing methods approach it either via grid-based classification or via image retrieval. Their performance significantly suffers when the spatial distribution of test images does not align with such choices. To address these limitations, we propose to leverage diffusi… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.18142v1-abstract-full').style.display = 'inline'; document.getElementById('2503.18142v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.18142v1-abstract-full" style="display: none;"> Image geolocalization is a fundamental yet challenging task, aiming at inferring the geolocation on Earth where an image is taken. Existing methods approach it either via grid-based classification or via image retrieval. Their performance significantly suffers when the spatial distribution of test images does not align with such choices. To address these limitations, we propose to leverage diffusion as a mechanism for image geolocalization. To avoid the problematic manifold reprojection step in diffusion, we developed a novel spherical positional encoding-decoding framework, which encodes points on a spherical surface (e.g., geolocations on Earth) into a Hilbert space of Spherical Harmonics coefficients and decodes points (geolocations) by mode-seeking. We call this type of position encoding Spherical Harmonics Dirac Delta (SHDD) Representation. We also propose a novel SirenNet-based architecture called CS-UNet to learn the conditional backward process in the latent SHDD space by minimizing a latent KL-divergence loss. We train a conditional latent diffusion model called LocDiffusion that generates geolocations under the guidance of images -- to the best of our knowledge, the first generative model for image geolocalization by diffusing geolocation information in a hidden location embedding space. We evaluate our method against SOTA image geolocalization baselines. LocDiffusion achieves competitive geolocalization performance and demonstrates significantly stronger generalizability to unseen geolocations. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.18142v1-abstract-full').style.display = 'none'; document.getElementById('2503.18142v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.18094">arXiv:2503.18094</a> <span> [<a href="https://arxiv.org/pdf/2503.18094">pdf</a>, <a href="https://arxiv.org/format/2503.18094">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Anomize: Better Open Vocabulary Video Anomaly Detection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+F">Fei Li</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+W">Wenxuan Liu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+J">Jingjing Chen</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+R">Ruixu Zhang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yuran Wang</a>, <a href="/search/cs?searchtype=author&query=Zhong%2C+X">Xian Zhong</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zheng Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.18094v1-abstract-short" style="display: inline;"> Open Vocabulary Video Anomaly Detection (OVVAD) seeks to detect and classify both base and novel anomalies. However, existing methods face two specific challenges related to novel anomalies. The first challenge is detection ambiguity, where the model struggles to assign accurate anomaly scores to unfamiliar anomalies. The second challenge is categorization confusion, where novel anomalies are ofte… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.18094v1-abstract-full').style.display = 'inline'; document.getElementById('2503.18094v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.18094v1-abstract-full" style="display: none;"> Open Vocabulary Video Anomaly Detection (OVVAD) seeks to detect and classify both base and novel anomalies. However, existing methods face two specific challenges related to novel anomalies. The first challenge is detection ambiguity, where the model struggles to assign accurate anomaly scores to unfamiliar anomalies. The second challenge is categorization confusion, where novel anomalies are often misclassified as visually similar base instances. To address these challenges, we explore supplementary information from multiple sources to mitigate detection ambiguity by leveraging multiple levels of visual data alongside matching textual information. Furthermore, we propose incorporating label relations to guide the encoding of new labels, thereby improving alignment between novel videos and their corresponding labels, which helps reduce categorization confusion. The resulting Anomize framework effectively tackles these issues, achieving superior performance on UCF-Crime and XD-Violence datasets, demonstrating its effectiveness in OVVAD. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.18094v1-abstract-full').style.display = 'none'; document.getElementById('2503.18094v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.17924">arXiv:2503.17924</a> <span> [<a href="https://arxiv.org/pdf/2503.17924">pdf</a>, <a href="https://arxiv.org/format/2503.17924">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> WLB-LLM: Workload-Balanced 4D Parallelism for Large Language Model Training </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zheng Wang</a>, <a href="/search/cs?searchtype=author&query=Cai%2C+A">Anna Cai</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+X">Xinfeng Xie</a>, <a href="/search/cs?searchtype=author&query=Pan%2C+Z">Zaifeng Pan</a>, <a href="/search/cs?searchtype=author&query=Guan%2C+Y">Yue Guan</a>, <a href="/search/cs?searchtype=author&query=Chu%2C+W">Weiwei Chu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+J">Jie Wang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+S">Shikai Li</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+J">Jianyu Huang</a>, <a href="/search/cs?searchtype=author&query=Cai%2C+C">Chris Cai</a>, <a href="/search/cs?searchtype=author&query=Hao%2C+Y">Yuchen Hao</a>, <a href="/search/cs?searchtype=author&query=Ding%2C+Y">Yufei Ding</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.17924v1-abstract-short" style="display: inline;"> In this work, we present WLB-LLM, a workLoad-balanced 4D parallelism for large language model training. We first thoroughly analyze the workload imbalance issue in LLM training and identify two primary sources of imbalance at the pipeline parallelism and context parallelism levels. Then, to address the imbalance issue, at the pipeline parallelism level, WLB-LLM incorporates a workload-aware variab… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.17924v1-abstract-full').style.display = 'inline'; document.getElementById('2503.17924v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.17924v1-abstract-full" style="display: none;"> In this work, we present WLB-LLM, a workLoad-balanced 4D parallelism for large language model training. We first thoroughly analyze the workload imbalance issue in LLM training and identify two primary sources of imbalance at the pipeline parallelism and context parallelism levels. Then, to address the imbalance issue, at the pipeline parallelism level, WLB-LLM incorporates a workload-aware variable-length document packing method to balance the computation and communication workload across micro-batches. Additionally, at the context parallelism level, WLB-LLM introduces a novel fine-grained per-document sharding strategy, ensuring each worker within a context parallelism group has an identical workload. Comprehensive experiments under different model scales demonstrate that WLB-LLM significantly mitigates the workload imbalance during 4D parallelism LLM training and achieves an average speedup of 1.23x when applying WLB-LLM in our internal LLM training framework. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.17924v1-abstract-full').style.display = 'none'; document.getElementById('2503.17924v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">12 pages, 16 figures</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> I.2.11 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.17908">arXiv:2503.17908</a> <span> [<a href="https://arxiv.org/pdf/2503.17908">pdf</a>, <a href="https://arxiv.org/format/2503.17908">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Does GCL Need a Large Number of Negative Samples? Enhancing Graph Contrastive Learning with Effective and Efficient Negative Sampling </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Huang%2C+Y">Yongqi Huang</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+J">Jitao Zhao</a>, <a href="/search/cs?searchtype=author&query=He%2C+D">Dongxiao He</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+D">Di Jin</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+Y">Yuxiao Huang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zhen Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.17908v1-abstract-short" style="display: inline;"> Graph Contrastive Learning (GCL) aims to self-supervised learn low-dimensional graph representations, primarily through instance discrimination, which involves manually mining positive and negative pairs from graphs, increasing the similarity of positive pairs while decreasing negative pairs. Drawing from the success of Contrastive Learning (CL) in other domains, a consensus has been reached that… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.17908v1-abstract-full').style.display = 'inline'; document.getElementById('2503.17908v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.17908v1-abstract-full" style="display: none;"> Graph Contrastive Learning (GCL) aims to self-supervised learn low-dimensional graph representations, primarily through instance discrimination, which involves manually mining positive and negative pairs from graphs, increasing the similarity of positive pairs while decreasing negative pairs. Drawing from the success of Contrastive Learning (CL) in other domains, a consensus has been reached that the effectiveness of GCLs depends on a large number of negative pairs. As a result, despite the significant computational overhead, GCLs typically leverage as many negative node pairs as possible to improve model performance. However, given that nodes within a graph are interconnected, we argue that nodes cannot be treated as independent instances. Therefore, we challenge this consensus: Does employing more negative nodes lead to a more effective GCL model? To answer this, we explore the role of negative nodes in the commonly used InfoNCE loss for GCL and observe that: (1) Counterintuitively, a large number of negative nodes can actually hinder the model's ability to distinguish nodes with different semantics. (2) A smaller number of high-quality and non-topologically coupled negative nodes are sufficient to enhance the discriminability of representations. Based on these findings, we propose a new method called GCL with Effective and Efficient Negative samples, E2Neg, which learns discriminative representations using only a very small set of representative negative samples. E2Neg significantly reduces computational overhead and speeds up model training. We demonstrate the effectiveness and efficiency of E2Neg across multiple datasets compared to other GCL methods. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.17908v1-abstract-full').style.display = 'none'; document.getElementById('2503.17908v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.17743">arXiv:2503.17743</a> <span> [<a href="https://arxiv.org/pdf/2503.17743">pdf</a>, <a href="https://arxiv.org/format/2503.17743">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> </div> </div> <p class="title is-5 mathjax"> Neutron particle transport 3D method of characteristic Multi GPU platform Parallel Computing </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhou%2C+F">Faguo Zhou</a>, <a href="/search/cs?searchtype=author&query=Li%2C+S">Shunde Li</a>, <a href="/search/cs?searchtype=author&query=Xue%2C+R">Rong Xue</a>, <a href="/search/cs?searchtype=author&query=Bu%2C+L">Lingkun Bu</a>, <a href="/search/cs?searchtype=author&query=Nie%2C+N">Ningming Nie</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+P">Peng Shi</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+J">Jue Wang</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+Y">Yun Hu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zongguo Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yangang Wang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Q">Qinmeng Yang</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+M">Miao Yu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.17743v1-abstract-short" style="display: inline;"> Three-dimensional neutron transport calculations using the Method of Characteristics (MOC) are highly regarded for their exceptional computational efficiency, precision, and stability. Nevertheless, when dealing with extensive-scale computations, the computational demands are substantial, leading to prolonged computation times. To address this challenge while considering GPU memory limitations, th… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.17743v1-abstract-full').style.display = 'inline'; document.getElementById('2503.17743v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.17743v1-abstract-full" style="display: none;"> Three-dimensional neutron transport calculations using the Method of Characteristics (MOC) are highly regarded for their exceptional computational efficiency, precision, and stability. Nevertheless, when dealing with extensive-scale computations, the computational demands are substantial, leading to prolonged computation times. To address this challenge while considering GPU memory limitations, this study transplants the real-time generation and characteristic line computation techniques onto the GPU platform. Empirical evidence emphasizes that the GPU-optimized approach maintains a heightened level of precision in computation results and produces a significant acceleration effect. Furthermore, to fully harness the computational capabilities of GPUs, a dual approach involving characteristic line preloading and load balancing mechanisms is adopted, further enhancing computational efficiency. The resulting increase in computational efficiency, compared to traditional methods, reaches an impressive 300 to 400-fold improvement. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.17743v1-abstract-full').style.display = 'none'; document.getElementById('2503.17743v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">14 pages, 7 figures. Submitted to a peer-reviewed journal</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.17724">arXiv:2503.17724</a> <span> [<a href="https://arxiv.org/pdf/2503.17724">pdf</a>, <a href="https://arxiv.org/format/2503.17724">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Towards Invisible Backdoor Attack on Text-to-Image Diffusion Model </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jie Zhang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zhongqi Wang</a>, <a href="/search/cs?searchtype=author&query=Shan%2C+S">Shiguang Shan</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+X">Xilin Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.17724v1-abstract-short" style="display: inline;"> Backdoor attacks targeting text-to-image diffusion models have advanced rapidly, enabling attackers to implant malicious triggers into these models to manipulate their outputs. However, current backdoor samples often exhibit two key abnormalities compared to benign samples: 1) Semantic Consistency, where backdoor prompts tend to generate images with similar semantic content even with significant t… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.17724v1-abstract-full').style.display = 'inline'; document.getElementById('2503.17724v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.17724v1-abstract-full" style="display: none;"> Backdoor attacks targeting text-to-image diffusion models have advanced rapidly, enabling attackers to implant malicious triggers into these models to manipulate their outputs. However, current backdoor samples often exhibit two key abnormalities compared to benign samples: 1) Semantic Consistency, where backdoor prompts tend to generate images with similar semantic content even with significant textual variations to the prompts; 2) Attention Consistency, where the trigger induces consistent structural responses in the cross-attention maps. These consistencies leave detectable traces for defenders, making backdoors easier to identify. To enhance the stealthiness of backdoor samples, we propose a novel Invisible Backdoor Attack (IBA) by explicitly mitigating these consistencies. Specifically, our approach leverages syntactic structures as backdoor triggers to amplify the sensitivity to textual variations, effectively breaking down the semantic consistency. Besides, a regularization method based on Kernel Maximum Mean Discrepancy (KMMD) is proposed to align the distribution of cross-attention responses between backdoor and benign samples, thereby disrupting attention consistency. Extensive experiments demonstrate that our IBA achieves a 97.5% attack success rate while exhibiting stronger resistance to defenses, with an average of over 98% backdoor samples bypassing three state-of-the-art detection mechanisms. The code is available at https://github.com/Robin-WZQ/IBA. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.17724v1-abstract-full').style.display = 'none'; document.getElementById('2503.17724v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.17604">arXiv:2503.17604</a> <span> [<a href="https://arxiv.org/pdf/2503.17604">pdf</a>, <a href="https://arxiv.org/format/2503.17604">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> OmniScience: A Domain-Specialized LLM for Scientific Reasoning and Discovery </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Prabhakar%2C+V">Vignesh Prabhakar</a>, <a href="/search/cs?searchtype=author&query=Islam%2C+M+A">Md Amirul Islam</a>, <a href="/search/cs?searchtype=author&query=Atanas%2C+A">Adam Atanas</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yao-Ting Wang</a>, <a href="/search/cs?searchtype=author&query=Han%2C+J">Joah Han</a>, <a href="/search/cs?searchtype=author&query=Jhunjhunwala%2C+A">Aastha Jhunjhunwala</a>, <a href="/search/cs?searchtype=author&query=Apte%2C+R">Rucha Apte</a>, <a href="/search/cs?searchtype=author&query=Clark%2C+R">Robert Clark</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+K">Kang Xu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zihan Wang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+K">Kai Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.17604v1-abstract-short" style="display: inline;"> Large Language Models (LLMs) have demonstrated remarkable potential in advancing scientific knowledge and addressing complex challenges. In this work, we introduce OmniScience, a specialized large reasoning model for general science, developed through three key components: (1) domain adaptive pretraining on a carefully curated corpus of scientific literature, (2) instruction tuning on a specialize… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.17604v1-abstract-full').style.display = 'inline'; document.getElementById('2503.17604v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.17604v1-abstract-full" style="display: none;"> Large Language Models (LLMs) have demonstrated remarkable potential in advancing scientific knowledge and addressing complex challenges. In this work, we introduce OmniScience, a specialized large reasoning model for general science, developed through three key components: (1) domain adaptive pretraining on a carefully curated corpus of scientific literature, (2) instruction tuning on a specialized dataset to guide the model in following domain-specific tasks, and (3) reasoning-based knowledge distillation through fine-tuning to significantly enhance its ability to generate contextually relevant and logically sound responses. We demonstrate the versatility of OmniScience by developing a battery agent that efficiently ranks molecules as potential electrolyte solvents or additives. Comprehensive evaluations reveal that OmniScience is competitive with state-of-the-art large reasoning models on the GPQA Diamond and domain-specific battery benchmarks, while outperforming all public reasoning and non-reasoning models with similar parameter counts. We further demonstrate via ablation experiments that domain adaptive pretraining and reasoning-based knowledge distillation are critical to attain our performance levels, across benchmarks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.17604v1-abstract-full').style.display = 'none'; document.getElementById('2503.17604v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.17467">arXiv:2503.17467</a> <span> [<a href="https://arxiv.org/pdf/2503.17467">pdf</a>, <a href="https://arxiv.org/format/2503.17467">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1109/TCSVT.2025.3552049">10.1109/TCSVT.2025.3552049 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> High Efficiency Wiener Filter-based Point Cloud Quality Enhancement for MPEG G-PCC </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wei%2C+Y">Yuxuan Wei</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zehan Wang</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+T">Tian Guo</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+H">Hao Liu</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+L">Liquan Shen</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+H">Hui Yuan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.17467v1-abstract-short" style="display: inline;"> Point clouds, which directly record the geometry and attributes of scenes or objects by a large number of points, are widely used in various applications such as virtual reality and immersive communication. However, due to the huge data volume and unstructured geometry, efficient compression of point clouds is very crucial. The Moving Picture Expert Group is establishing a geometry-based point clo… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.17467v1-abstract-full').style.display = 'inline'; document.getElementById('2503.17467v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.17467v1-abstract-full" style="display: none;"> Point clouds, which directly record the geometry and attributes of scenes or objects by a large number of points, are widely used in various applications such as virtual reality and immersive communication. However, due to the huge data volume and unstructured geometry, efficient compression of point clouds is very crucial. The Moving Picture Expert Group is establishing a geometry-based point cloud compression (G-PCC) standard for both static and dynamic point clouds in recent years. Although lossy compression of G-PCC can achieve a very high compression ratio, the reconstruction quality is relatively low, especially at low bitrates. To mitigate this problem, we propose a high efficiency Wiener filter that can be integrated into the encoder and decoder pipeline of G-PCC to improve the reconstruction quality as well as the rate-distortion performance for dynamic point clouds. Specifically, we first propose a basic Wiener filter, and then improve it by introducing coefficients inheritance and variance-based point classification for the Luma component. Besides, to reduce the complexity of the nearest neighbor search during the application of the Wiener filter, we also propose a Morton code-based fast nearest neighbor search algorithm for efficient calculation of filter coefficients. Experimental results demonstrate that the proposed method can achieve average Bj酶ntegaard delta rates of -6.1%, -7.3%, and -8.0% for Luma, Chroma Cb, and Chroma Cr components, respectively, under the condition of lossless-geometry-lossy-attributes configuration compared to the latest G-PCC encoding platform (i.e., geometry-based solid content test model version 7.0 release candidate 2) by consuming affordable computational complexity. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.17467v1-abstract-full').style.display = 'none'; document.getElementById('2503.17467v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.17407">arXiv:2503.17407</a> <span> [<a href="https://arxiv.org/pdf/2503.17407">pdf</a>, <a href="https://arxiv.org/format/2503.17407">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> A Comprehensive Survey on Long Context Language Modeling </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+J">Jiaheng Liu</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+D">Dawei Zhu</a>, <a href="/search/cs?searchtype=author&query=Bai%2C+Z">Zhiqi Bai</a>, <a href="/search/cs?searchtype=author&query=He%2C+Y">Yancheng He</a>, <a href="/search/cs?searchtype=author&query=Liao%2C+H">Huanxuan Liao</a>, <a href="/search/cs?searchtype=author&query=Que%2C+H">Haoran Que</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zekun Wang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+C">Chenchen Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+G">Ge Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jiebin Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yuanxing Zhang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Z">Zhuo Chen</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+H">Hangyu Guo</a>, <a href="/search/cs?searchtype=author&query=Li%2C+S">Shilong Li</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Z">Ziqiang Liu</a>, <a href="/search/cs?searchtype=author&query=Shan%2C+Y">Yong Shan</a>, <a href="/search/cs?searchtype=author&query=Song%2C+Y">Yifan Song</a>, <a href="/search/cs?searchtype=author&query=Tian%2C+J">Jiayi Tian</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+W">Wenhao Wu</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+Z">Zhejian Zhou</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+R">Ruijie Zhu</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+J">Junlan Feng</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+Y">Yang Gao</a>, <a href="/search/cs?searchtype=author&query=He%2C+S">Shizhu He</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zhoujun Li</a> , et al. (12 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.17407v1-abstract-short" style="display: inline;"> Efficient processing of long contexts has been a persistent pursuit in Natural Language Processing. With the growing number of long documents, dialogues, and other textual data, it is important to develop Long Context Language Models (LCLMs) that can process and analyze extensive inputs in an effective and efficient way. In this paper, we present a comprehensive survey on recent advances in long-c… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.17407v1-abstract-full').style.display = 'inline'; document.getElementById('2503.17407v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.17407v1-abstract-full" style="display: none;"> Efficient processing of long contexts has been a persistent pursuit in Natural Language Processing. With the growing number of long documents, dialogues, and other textual data, it is important to develop Long Context Language Models (LCLMs) that can process and analyze extensive inputs in an effective and efficient way. In this paper, we present a comprehensive survey on recent advances in long-context modeling for large language models. Our survey is structured around three key aspects: how to obtain effective and efficient LCLMs, how to train and deploy LCLMs efficiently, and how to evaluate and analyze LCLMs comprehensively. For the first aspect, we discuss data strategies, architectural designs, and workflow approaches oriented with long context processing. For the second aspect, we provide a detailed examination of the infrastructure required for LCLM training and inference. For the third aspect, we present evaluation paradigms for long-context comprehension and long-form generation, as well as behavioral analysis and mechanism interpretability of LCLMs. Beyond these three key aspects, we thoroughly explore the diverse application scenarios where existing LCLMs have been deployed and outline promising future development directions. This survey provides an up-to-date review of the literature on long-context LLMs, which we wish to serve as a valuable resource for both researchers and engineers. An associated GitHub repository collecting the latest papers and repos is available at: \href{https://github.com/LCLM-Horizon/A-Comprehensive-Survey-For-Long-Context-Language-Modeling}{\color[RGB]{175,36,67}{LCLM-Horizon}}. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.17407v1-abstract-full').style.display = 'none'; document.getElementById('2503.17407v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.17261">arXiv:2503.17261</a> <span> [<a href="https://arxiv.org/pdf/2503.17261">pdf</a>, <a href="https://arxiv.org/format/2503.17261">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Cross-Modal Interactive Perception Network with Mamba for Lung Tumor Segmentation in PET-CT Images </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Mei%2C+J">Jie Mei</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+C">Chenyu Lin</a>, <a href="/search/cs?searchtype=author&query=Qiu%2C+Y">Yu Qiu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yaonan Wang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+H">Hui Zhang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Ziyang Wang</a>, <a href="/search/cs?searchtype=author&query=Dai%2C+D">Dong Dai</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.17261v1-abstract-short" style="display: inline;"> Lung cancer is a leading cause of cancer-related deaths globally. PET-CT is crucial for imaging lung tumors, providing essential metabolic and anatomical information, while it faces challenges such as poor image quality, motion artifacts, and complex tumor morphology. Deep learning-based models are expected to address these problems, however, existing small-scale and private datasets limit signifi… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.17261v1-abstract-full').style.display = 'inline'; document.getElementById('2503.17261v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.17261v1-abstract-full" style="display: none;"> Lung cancer is a leading cause of cancer-related deaths globally. PET-CT is crucial for imaging lung tumors, providing essential metabolic and anatomical information, while it faces challenges such as poor image quality, motion artifacts, and complex tumor morphology. Deep learning-based models are expected to address these problems, however, existing small-scale and private datasets limit significant performance improvements for these methods. Hence, we introduce a large-scale PET-CT lung tumor segmentation dataset, termed PCLT20K, which comprises 21,930 pairs of PET-CT images from 605 patients. Furthermore, we propose a cross-modal interactive perception network with Mamba (CIPA) for lung tumor segmentation in PET-CT images. Specifically, we design a channel-wise rectification module (CRM) that implements a channel state space block across multi-modal features to learn correlated representations and helps filter out modality-specific noise. A dynamic cross-modality interaction module (DCIM) is designed to effectively integrate position and context information, which employs PET images to learn regional position information and serves as a bridge to assist in modeling the relationships between local features of CT images. Extensive experiments on a comprehensive benchmark demonstrate the effectiveness of our CIPA compared to the current state-of-the-art segmentation methods. We hope our research can provide more exploration opportunities for medical image segmentation. The dataset and code are available at https://github.com/mj129/CIPA. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.17261v1-abstract-full').style.display = 'none'; document.getElementById('2503.17261v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to CVPR 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.17079">arXiv:2503.17079</a> <span> [<a href="https://arxiv.org/pdf/2503.17079">pdf</a>, <a href="https://arxiv.org/format/2503.17079">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Networking and Internet Architecture">cs.NI</span> </div> </div> <p class="title is-5 mathjax"> Interference Identification in Multi-User Optical Spectrum as a Service using Convolutional Neural Networks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Raj%2C+A">Agastya Raj</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zehao Wang</a>, <a href="/search/cs?searchtype=author&query=Slyne%2C+F">Frank Slyne</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+T">Tingjun Chen</a>, <a href="/search/cs?searchtype=author&query=Kilper%2C+D">Dan Kilper</a>, <a href="/search/cs?searchtype=author&query=Ruffini%2C+M">Marco Ruffini</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.17079v1-abstract-short" style="display: inline;"> We introduce a ML-based architecture for network operators to detect impairments from specific OSaaS users while blind to the users' internal spectrum details. Experimental studies with three OSaaS users demonstrate the model's capability to accurately classify the source of impairments, achieving classification accuracy of 94.2%. </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.17079v1-abstract-full" style="display: none;"> We introduce a ML-based architecture for network operators to detect impairments from specific OSaaS users while blind to the users' internal spectrum details. Experimental studies with three OSaaS users demonstrate the model's capability to accurately classify the source of impairments, achieving classification accuracy of 94.2%. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.17079v1-abstract-full').style.display = 'none'; document.getElementById('2503.17079v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">This paper is a preprint of a paper accepted to ECOC 2024 and is subject to Institution of Engineering and Technology Copyright. A copy of record will be available at IET Digital Library</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.17072">arXiv:2503.17072</a> <span> [<a href="https://arxiv.org/pdf/2503.17072">pdf</a>, <a href="https://arxiv.org/format/2503.17072">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Networking and Internet Architecture">cs.NI</span> </div> </div> <p class="title is-5 mathjax"> Multi-Span Optical Power Spectrum Evolution Modeling using ML-based Multi-Decoder Attention Framework </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Raj%2C+A">Agastya Raj</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zehao Wang</a>, <a href="/search/cs?searchtype=author&query=Slyne%2C+F">Frank Slyne</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+T">Tingjun Chen</a>, <a href="/search/cs?searchtype=author&query=Kilper%2C+D">Dan Kilper</a>, <a href="/search/cs?searchtype=author&query=Ruffini%2C+M">Marco Ruffini</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.17072v1-abstract-short" style="display: inline;"> We implement a ML-based attention framework with component-specific decoders, improving optical power spectrum prediction in multi-span networks. By reducing the need for in-depth training on each component, the framework can be scaled to multi-span topologies with minimal data collection, making it suitable for brown-field scenarios. </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.17072v1-abstract-full" style="display: none;"> We implement a ML-based attention framework with component-specific decoders, improving optical power spectrum prediction in multi-span networks. By reducing the need for in-depth training on each component, the framework can be scaled to multi-span topologies with minimal data collection, making it suitable for brown-field scenarios. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.17072v1-abstract-full').style.display = 'none'; document.getElementById('2503.17072v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">This paper is a preprint of a paper accepted in ECOC 2024 and is subject to Institution of Engineering and Technology Copyright. A copy of record will be available at IET Digital Library</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.16979">arXiv:2503.16979</a> <span> [<a href="https://arxiv.org/pdf/2503.16979">pdf</a>, <a href="https://arxiv.org/format/2503.16979">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Instant Gaussian Stream: Fast and Generalizable Streaming of Dynamic Scene Reconstruction via Gaussian Splatting </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yan%2C+J">Jinbo Yan</a>, <a href="/search/cs?searchtype=author&query=Peng%2C+R">Rui Peng</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zhiyan Wang</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+L">Luyang Tang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+J">Jiayu Yang</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+J">Jie Liang</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+J">Jiahao Wu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+R">Ronggang Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.16979v1-abstract-short" style="display: inline;"> Building Free-Viewpoint Videos in a streaming manner offers the advantage of rapid responsiveness compared to offline training methods, greatly enhancing user experience. However, current streaming approaches face challenges of high per-frame reconstruction time (10s+) and error accumulation, limiting their broader application. In this paper, we propose Instant Gaussian Stream (IGS), a fast and ge… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.16979v1-abstract-full').style.display = 'inline'; document.getElementById('2503.16979v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.16979v1-abstract-full" style="display: none;"> Building Free-Viewpoint Videos in a streaming manner offers the advantage of rapid responsiveness compared to offline training methods, greatly enhancing user experience. However, current streaming approaches face challenges of high per-frame reconstruction time (10s+) and error accumulation, limiting their broader application. In this paper, we propose Instant Gaussian Stream (IGS), a fast and generalizable streaming framework, to address these issues. First, we introduce a generalized Anchor-driven Gaussian Motion Network, which projects multi-view 2D motion features into 3D space, using anchor points to drive the motion of all Gaussians. This generalized Network generates the motion of Gaussians for each target frame in the time required for a single inference. Second, we propose a Key-frame-guided Streaming Strategy that refines each key frame, enabling accurate reconstruction of temporally complex scenes while mitigating error accumulation. We conducted extensive in-domain and cross-domain evaluations, demonstrating that our approach can achieve streaming with a average per-frame reconstruction time of 2s+, alongside a enhancement in view synthesis quality. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.16979v1-abstract-full').style.display = 'none'; document.getElementById('2503.16979v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.16948">arXiv:2503.16948</a> <span> [<a href="https://arxiv.org/pdf/2503.16948">pdf</a>, <a href="https://arxiv.org/format/2503.16948">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> MagicColor: Multi-Instance Sketch Colorization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yinhan Zhang</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+Y">Yue Ma</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+B">Bingyuan Wang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Q">Qifeng Chen</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zeyu Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.16948v1-abstract-short" style="display: inline;"> We present \textit{MagicColor}, a diffusion-based framework for multi-instance sketch colorization. The production of multi-instance 2D line art colorization adheres to an industry-standard workflow, which consists of three crucial stages: the design of line art characters, the coloring of individual objects, and the refinement process. The artists are required to repeat the process of coloring ea… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.16948v1-abstract-full').style.display = 'inline'; document.getElementById('2503.16948v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.16948v1-abstract-full" style="display: none;"> We present \textit{MagicColor}, a diffusion-based framework for multi-instance sketch colorization. The production of multi-instance 2D line art colorization adheres to an industry-standard workflow, which consists of three crucial stages: the design of line art characters, the coloring of individual objects, and the refinement process. The artists are required to repeat the process of coloring each instance one by one, which is inaccurate and inefficient. Meanwhile, current generative methods fail to solve this task due to the challenge of multi-instance pair data collection. To tackle these challenges, we incorporate three technical designs to ensure precise character detail transcription and achieve multi-instance sketch colorization in a single forward. Specifically, we first propose the self-play training strategy to solve the lack of training data. Then we introduce an instance guider to feed the color of the instance. To achieve accurate color matching, we present fine-grained color matching with edge loss to enhance visual quality. Equipped with the proposed modules, MagicColor enables automatically transforming sketches into vividly-colored images with accurate consistency and multi-instance control. Experiments on our collected datasets show that our model outperforms existing methods regarding chromatic precision. Specifically, our model critically automates the colorization process with zero manual adjustments, so novice users can produce stylistically consistent artwork by providing reference instances and the original line art. Our code and additional details are available at https://yinhan-zhang.github.io/color <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.16948v1-abstract-full').style.display = 'none'; document.getElementById('2503.16948v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.16914">arXiv:2503.16914</a> <span> [<a href="https://arxiv.org/pdf/2503.16914">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> A New Segment Routing method with Swap Node Selection Strategy Based on Deep Reinforcement Learning for Software Defined Network </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ye%2C+M">Miao Ye</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+J">Jihao Zheng</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+Q">Qiuxiang Jiang</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+Y">Yuan Huang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Ziheng Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yong Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.16914v1-abstract-short" style="display: inline;"> The existing segment routing (SR) methods need to determine the routing first and then use path segmentation approaches to select swap nodes to form a segment routing path (SRP). They require re-segmentation of the path when the routing changes. Furthermore, they do not consider the flow table issuance time, which cannot maximize the speed of issuance flow table. To address these issues, this pape… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.16914v1-abstract-full').style.display = 'inline'; document.getElementById('2503.16914v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.16914v1-abstract-full" style="display: none;"> The existing segment routing (SR) methods need to determine the routing first and then use path segmentation approaches to select swap nodes to form a segment routing path (SRP). They require re-segmentation of the path when the routing changes. Furthermore, they do not consider the flow table issuance time, which cannot maximize the speed of issuance flow table. To address these issues, this paper establishes an optimization model that can simultaneously form routing strategies and path segmentation strategies for selecting the appropriate swap nodes to reduce flow table issuance time. It also designs an intelligent segment routing algorithm based on deep reinforcement learning (DRL-SR) to solve the proposed model. First, a traffic matrix is designed as the state space for the deep reinforcement learning agent; this matrix includes multiple QoS performance indicators, flow table issuance time overhead and SR label stack depth. Second, the action selection strategy and corresponding reward function are designed, where the agent selects the next node considering the routing; in addition, the action selection strategy whether the newly added node is selected as the swap node and the corresponding reward function are designed considering the time cost factor for the controller to issue the flow table to the swap node. Finally, a series of experiments and their results show that, compared with the existing methods, the designed segmented route optimization model and the intelligent solution algorithm (DRL-SR) can reduce the time overhead required to complete the segmented route establishment task while optimizing performance metrics such as throughput, delays and packet losses. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.16914v1-abstract-full').style.display = 'none'; document.getElementById('2503.16914v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.16905">arXiv:2503.16905</a> <span> [<a href="https://arxiv.org/pdf/2503.16905">pdf</a>, <a href="https://arxiv.org/format/2503.16905">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> MAPS: A Multi-Agent Framework Based on Big Seven Personality and Socratic Guidance for Multimodal Scientific Problem Solving </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jian Zhang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zhiyuan Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zhangqi Wang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xinyu Zhang</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+F">Fangzhi Xu</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+Q">Qika Lin</a>, <a href="/search/cs?searchtype=author&query=Mao%2C+R">Rui Mao</a>, <a href="/search/cs?searchtype=author&query=Cambria%2C+E">Erik Cambria</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+J">Jun Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.16905v1-abstract-short" style="display: inline;"> Multimodal scientific problems (MSPs) involve complex issues that require the integration of multiple modalities, such as text and diagrams, presenting a significant challenge in artificial intelligence. While progress has been made in addressing traditional scientific problems, MSPs still face two primary issues: the challenge of multi-modal comprehensive reasoning in scientific problem-solving a… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.16905v1-abstract-full').style.display = 'inline'; document.getElementById('2503.16905v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.16905v1-abstract-full" style="display: none;"> Multimodal scientific problems (MSPs) involve complex issues that require the integration of multiple modalities, such as text and diagrams, presenting a significant challenge in artificial intelligence. While progress has been made in addressing traditional scientific problems, MSPs still face two primary issues: the challenge of multi-modal comprehensive reasoning in scientific problem-solving and the lack of reflective and rethinking capabilities. To address these issues, we introduce a Multi-Agent framework based on the Big Seven Personality and Socratic guidance (MAPS). This framework employs seven distinct agents that leverage feedback mechanisms and the Socratic method to guide the resolution of MSPs. To tackle the first issue, we propose a progressive four-agent solving strategy, where each agent focuses on a specific stage of the problem-solving process. For the second issue, we introduce a Critic agent, inspired by Socratic questioning, which prompts critical thinking and stimulates autonomous learning. We conduct extensive experiments on the EMMA, Olympiad, and MathVista datasets, achieving promising results that outperform the current SOTA model by 15.84% across all tasks. Meanwhile, the additional analytical experiments also verify the model's progress as well as generalization ability. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.16905v1-abstract-full').style.display = 'none'; document.getElementById('2503.16905v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> </ol> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Wang%2C+Z&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Wang%2C+Z&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Wang%2C+Z&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Wang%2C+Z&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Wang%2C+Z&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Wang%2C+Z&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">…</span></li> </ul> </nav> <div class="is-hidden-tablet">  <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>  </span> </div> </div> </main> <footer> <div class="columns is-desktop" role="navigation" aria-label="Secondary">  <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div>   <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div>  </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>

CINXE.COM

Search | arXiv e-print repository