Search | arXiv e-print repository

Showing 1–50 of 2,575 results for author: Sun, Y

Searching in archive cs. Results are sorted by announcement date (newest first), 50 per page.
1. arXiv:2503.24148 [pdf, other] | cs.NI | doi: 10.1109/TNET.2024.3495660
   Trident: Interference Avoidance in Multi-reader Backscatter Network via Frequency-space Division
   Authors: Yang Zou, Xin Na, Yimiao Sun, Yuan He
   Abstract: Backscatter is a key technology for battery-free sensing in industrial IoT applications. To fully cover numerous tags in the deployment area, one often needs to deploy multiple readers, each of which communicates with tags within its communication range. However, the actual backscattered signals from a tag are likely to reach a reader outside its communication range and cause interference. Conventional TDMA- or CSMA-based approaches for interference avoidance separate readers' media access in time, leading to limited network throughput. In this paper, we propose TRIDENT, a novel backscatter design that enables interference avoidance via frequency-space division. By incorporating a tunable bandpass filter and multiple terminal loads, a TRIDENT tag can detect its channel condition and adaptively adjust the frequency and the power of its backscattered signals. We further propose a frequency assignment algorithm for the readers. With these designs, all the readers in the network can operate concurrently without interference. We implement TRIDENT and evaluate its performance under various settings. The results demonstrate that TRIDENT enhances the network throughput by 3.18x compared to the TDMA-based scheme.
   Submitted 31 March, 2025; originally announced March 2025.
   Report number: TRANS-01
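   The abstract leaves the readers' frequency assignment algorithm unspecified. One plausible reading is that readers whose coverage overlaps form an interference graph, so assigning sub-bands reduces to graph coloring. The sketch below is a hypothetical greedy illustration of that reading, not the paper's algorithm; assign_frequencies and interferes are invented names.

```python
# Hypothetical sketch: give each backscatter reader a sub-band so that no two
# conflicting readers share one. Plain greedy graph coloring; the paper's
# actual frequency assignment algorithm is not described in the abstract.

def assign_frequencies(readers, interferes):
    """readers: iterable of reader ids; interferes(a, b) -> True when readers
    a and b can hear the same tags' backscatter and therefore conflict."""
    assignment = {}
    for r in sorted(readers):
        taken = {band for other, band in assignment.items() if interferes(r, other)}
        band = 0
        while band in taken:   # lowest sub-band unused by any conflicting reader
            band += 1
        assignment[r] = band
    return assignment

# Toy deployment: four readers in a line, only adjacent readers conflict.
print(assign_frequencies(range(4), lambda a, b: abs(a - b) == 1))
# -> {0: 0, 1: 1, 2: 0, 3: 1}: two sub-bands let all four run concurrently
```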
2. arXiv:2503.23609 [pdf, other] | cs.HC, cs.CY | doi: 10.1145/3711058
   Rethinking Technological Solutions for Community-Based Older Adult Care: Insights from 'Older Partners' in China
   Authors: Yuing Sun, Sam Addison Ankenbauer, Zhifan Guo, Yuchen Chen, Xiaojuan Ma, Liang He
   Abstract: Aging in place refers to the enabling of individuals to age comfortably and securely within their own homes and communities. Aging in place relies on robust infrastructure, prompting the development and implementation of both human-led care services and information and communication technologies to provide support. Through a long-term ethnographic study that includes semi-structured interviews with 24 stakeholders, we consider these human- and technology-driven care infrastructures for aging in place, examining their origins, deployment, interactions with older adults, and challenges. In doing so, we reconsider the value of these different forms of older adult care, highlighting the various issues associated with using, for instance, health monitoring technology or appointment scheduling systems to care for older adults aging in place. We suggest that technology should take a supportive, not substitutive, role in older adult care infrastructure. Furthermore, we note that designing for aging in place should move beyond a narrow focus on independence in one's home to instead encompass the broader community and its dynamics.
   Submitted 30 March, 2025; originally announced March 2025.
   Comments: Accepted at CSCW 2025

3. arXiv:2503.23327 [pdf] | cs.HC
   AI Delivers Creative Output but Struggles with Thinking Processes
   Authors: Man Zhang, Ying Li, Yang Peng, Yijia Sun, Wenxin Guo, Huiqing Hu, Shi Chen, Qingbai Zhao
   Abstract: A key objective in artificial intelligence (AI) development is to create systems that match or surpass human creativity. Although current AI models perform well across diverse creative tasks, it remains unclear whether these achievements reflect genuine creative thinking. This study examined whether AI models (GPT-3.5-turbo, GPT-4, and GPT-4o) engage in creative thinking by comparing their performance with humans across various creative tasks and core cognitive processes. Results showed that AI models outperformed humans in divergent thinking, convergent thinking, and insight problem-solving, but underperformed in creative writing. Compared to humans, AI generated lower forward flow values in both free and chain association tasks and showed lower accuracy in the representational change task. In creative evaluation, AI exhibited no significant correlation between the weights of novelty and appropriateness when predicting creative ratings, suggesting the absence of a human-like trade-off strategy. AI also had higher decision error scores in creative selection, suggesting difficulty identifying the most creative ideas. These findings suggest that while AI can mimic human creativity, its strong performance in creative tasks is likely driven by non-creative mechanisms rather than genuine creative thinking.
   Submitted 30 March, 2025; originally announced March 2025.
4. arXiv:2503.23139 [pdf, ps, other] | cs.GT
   On-Off Systems with Strategic Customers
   Authors: Yanwei Sun, Zhe Liu, Chiwei Yan
   Abstract: Motivated by applications such as urban traffic control and make-to-order systems, we study a fluid model of a single-server, on-off system that can accommodate multiple queues. The server visits each queue in order: when a queue is served, it is "on", and when the server is serving another queue or transitioning between queues, it is "off". Customers arrive over time, observe the state of the system, and decide whether to join. We consider two regimes for the formation of the on and off durations. In the exogenous setting, each queue's on and off durations are predetermined. We explicitly characterize the equilibrium outcome in closed form and give a compact linear program to compute the optimal on-off durations that maximize the total reward collected from serving customers. In the endogenous setting, the durations depend on customers' joining decisions under an exhaustive service policy where the server never leaves a non-empty queue. We show that an optimal policy in this case extends service beyond the first clearance for at most one queue. Using this property, we introduce a closed-form procedure that computes an optimal policy in no more than 2n steps for a system with n queues.
   Submitted 29 March, 2025; originally announced March 2025.
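   The abstract states that a compact linear program yields the optimal on-off durations in the exogenous setting, but does not give the formulation. As a toy stand-in, the sketch below solves a two-queue duration-allocation LP with SciPy; the rewards, cycle budget, and caps are invented for illustration only.

```python
# Hypothetical toy LP in the spirit of the exogenous setting: choose on-durations
# x_i for two queues within one service cycle to maximize reward, subject to a
# cycle-length budget (minus switching overhead) and per-queue caps. All numbers
# are made up; the paper's actual LP is not reproduced in the abstract.
from scipy.optimize import linprog

reward = [3.0, 1.5]        # assumed reward per unit of on-time at each queue
cycle, switch = 10.0, 1.0  # assumed cycle length and per-visit switching loss

res = linprog(
    c=[-r for r in reward],                        # maximize reward.x == minimize -reward.x
    A_ub=[[1.0, 1.0]], b_ub=[cycle - 2 * switch],  # x1 + x2 <= usable cycle time
    bounds=[(0.0, 6.0), (0.0, 6.0)],               # assumed per-queue on-duration caps
)
print(res.x)  # [6., 2.] -- fill the higher-reward queue's cap first
```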
5. arXiv:2503.22998 [pdf, other] | cs.LG, cs.AI, cs.CR
   AuditVotes: A Framework Towards More Deployable Certified Robustness for Graph Neural Networks
   Authors: Yuni Lai, Yulin Zhu, Yixuan Sun, Yulun Wu, Bin Xiao, Gaolei Li, Jianhua Li, Kai Zhou
   Abstract: Despite advancements in Graph Neural Networks (GNNs), adaptive attacks continue to challenge their robustness. Certified robustness based on randomized smoothing has emerged as a promising solution, offering provable guarantees that a model's predictions remain stable under adversarial perturbations within a specified range. However, existing methods face a critical trade-off between accuracy and robustness, as achieving stronger robustness requires introducing greater noise into the input graph. This excessive randomization degrades data quality and disrupts prediction consistency, limiting the practical deployment of certifiably robust GNNs in real-world scenarios where both accuracy and robustness are essential. To address this challenge, we propose AuditVotes, the first framework to achieve both high clean accuracy and certifiably robust accuracy for GNNs. It integrates randomized smoothing with two key components, augmentation and conditional smoothing, aiming to improve data quality and prediction consistency. The augmentation, acting as a pre-processing step, de-noises the randomized graph, significantly improving data quality and clean accuracy. The conditional smoothing, serving as a post-processing step, employs a filtering function to selectively count votes, thereby filtering low-quality predictions and improving voting consistency. Extensive experimental results demonstrate that AuditVotes significantly enhances clean accuracy, certified robustness, and empirical robustness while maintaining high computational efficiency. Notably, compared to baseline randomized smoothing, AuditVotes improves clean accuracy by 437.1% and certified accuracy by 409.3% when the attacker can arbitrarily insert 20 edges on the Cora-ML dataset, representing a substantial step toward deploying certifiably robust GNNs in real-world applications.
   Submitted 29 March, 2025; originally announced March 2025.
   Comments: 20 pages
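   The conditional-smoothing idea, counting only votes that pass a filtering function, can be pictured with a short sketch. Everything here (the classifier interface, the confidence filter, the threshold) is an assumption for illustration; the paper defines the actual components.

```python
# Hypothetical sketch of the vote-counting step: randomized smoothing draws many
# randomized copies of the graph, a base GNN classifies each copy, and a filter
# decides which predictions count as votes (the "conditional smoothing" idea).
from collections import Counter

def smoothed_predict(classify, perturb, graph, n_samples=1000, min_conf=0.8):
    """classify(g) -> (label, confidence); perturb(g) -> randomized copy of g."""
    votes = Counter()
    for _ in range(n_samples):
        label, confidence = classify(perturb(graph))  # noisy copy -> prediction
        if confidence >= min_conf:                    # drop low-quality votes
            votes[label] += 1
    # Majority class among the surviving votes; certification would then bound
    # how many of these votes an attacker could flip within the threat model.
    return votes.most_common(1)[0][0] if votes else None
```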
href="/search/cs?searchtype=author&amp;query=Huang%2C+W">Wanqing Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+S">Shouming Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Meng%2C+X">Xiangquan Meng</a>, <a href="/search/cs?searchtype=author&amp;query=Qu%2C+P">Pengzhan Qu</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+C">Chao Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Fan%2C+J">Jiaxuan Fan</a>, <a href="/search/cs?searchtype=author&amp;query=He%2C+Y">Yuan He</a>, <a href="/search/cs?searchtype=author&amp;query=Qi%2C+H">Hongsheng Qi</a>, <a href="/search/cs?searchtype=author&amp;query=Du%2C+Y">Yangzhou Du</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.22747v1-abstract-short" style="display: inline;"> Demand is spiking in industrial fields for multidisciplinary forecasting, where a broad spectrum of sectors needs planning and forecasts to streamline intelligent business management, such as demand forecasting, product planning, inventory optimization, etc. Specifically, these tasks expecting intelligent approaches to learn from sequentially collected historical data and then foresee most possibl&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.22747v1-abstract-full').style.display = 'inline'; document.getElementById('2503.22747v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.22747v1-abstract-full" style="display: none;"> Demand is spiking in industrial fields for multidisciplinary forecasting, where a broad spectrum of sectors needs planning and forecasts to streamline intelligent business management, such as demand forecasting, product planning, inventory optimization, etc. Specifically, these tasks expecting intelligent approaches to learn from sequentially collected historical data and then foresee most possible trend, i.e. time series forecasting. Challenge of it lies in interpreting complex business contexts and the efficiency and generalisation of modelling. With aspirations of pre-trained foundational models for such purpose, given their remarkable success of large foundation model across legions of tasks, we disseminate \leforecast{}, an enterprise intelligence platform tailored for time series tasks. It integrates advanced interpretations of time series data and multi-source information, and a three-pillar modelling engine combining a large foundation model (Le-TSFM), multimodal model and hybrid model to derive insights, predict or infer futures, and then drive optimisation across multiple sectors in enterprise operations. The framework is composed by a model pool, model profiling module, and two different fusion approaches regarding original model architectures. Experimental results verify the efficiency of our trail fusion concepts: router-based fusion network and coordination of large and small models, resulting in high costs for redundant development and maintenance of models. This work reviews deployment of LeForecast and its performance in three industrial use cases. Our comprehensive experiments indicate that LeForecast is a profound and practical platform for efficient and competitive performance. And we do hope that this work can enlighten the research and grounding of time series techniques in accelerating enterprise. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.22747v1-abstract-full').style.display = 'none'; document.getElementById('2503.22747v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.21847">arXiv:2503.21847</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.21847">pdf</a>, <a href="https://arxiv.org/format/2503.21847">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> ReCoM: Realistic Co-Speech Motion Generation with Recurrent Embedded Transformer </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Xie%2C+Y">Yong Xie</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+Y">Yunlian Sun</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+H">Hongwen Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Y">Yebin Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Tang%2C+J">Jinhui Tang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.21847v1-abstract-short" style="display: inline;"> We present ReCoM, an efficient framework for generating high-fidelity and generalizable human body motions synchronized with speech. The core innovation lies in the Recurrent Embedded Transformer (RET), which integrates Dynamic Embedding Regularization (DER) into a Vision Transformer (ViT) core architecture to explicitly model co-speech motion dynamics. This architecture enables joint spatial-temp&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.21847v1-abstract-full').style.display = 'inline'; document.getElementById('2503.21847v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.21847v1-abstract-full" style="display: none;"> We present ReCoM, an efficient framework for generating high-fidelity and generalizable human body motions synchronized with speech. The core innovation lies in the Recurrent Embedded Transformer (RET), which integrates Dynamic Embedding Regularization (DER) into a Vision Transformer (ViT) core architecture to explicitly model co-speech motion dynamics. This architecture enables joint spatial-temporal dependency modeling, thereby enhancing gesture naturalness and fidelity through coherent motion synthesis. To enhance model robustness, we incorporate the proposed DER strategy, which equips the model with dual capabilities of noise resistance and cross-domain generalization, thereby improving the naturalness and fluency of zero-shot motion generation for unseen speech inputs. To mitigate inherent limitations of autoregressive inference, including error accumulation and limited self-correction, we propose an iterative reconstruction inference (IRI) strategy. 
7. arXiv:2503.21847 [pdf, other] | cs.GR, cs.AI
   ReCoM: Realistic Co-Speech Motion Generation with Recurrent Embedded Transformer
   Authors: Yong Xie, Yunlian Sun, Hongwen Zhang, Yebin Liu, Jinhui Tang
   Abstract: We present ReCoM, an efficient framework for generating high-fidelity and generalizable human body motions synchronized with speech. The core innovation lies in the Recurrent Embedded Transformer (RET), which integrates Dynamic Embedding Regularization (DER) into a Vision Transformer (ViT) core architecture to explicitly model co-speech motion dynamics. This architecture enables joint spatial-temporal dependency modeling, thereby enhancing gesture naturalness and fidelity through coherent motion synthesis. To enhance model robustness, we incorporate the proposed DER strategy, which equips the model with dual capabilities of noise resistance and cross-domain generalization, thereby improving the naturalness and fluency of zero-shot motion generation for unseen speech inputs. To mitigate inherent limitations of autoregressive inference, including error accumulation and limited self-correction, we propose an iterative reconstruction inference (IRI) strategy. IRI refines motion sequences via cyclic pose reconstruction, driven by two key components: (1) classifier-free guidance improves distribution alignment between generated and real gestures without auxiliary supervision, and (2) a temporal smoothing process eliminates abrupt inter-frame transitions while ensuring kinematic continuity. Extensive experiments on benchmark datasets validate ReCoM's effectiveness, achieving state-of-the-art performance across metrics. Notably, it reduces the Fréchet Gesture Distance (FGD) from 18.70 to 2.48, demonstrating an 86.7% improvement in motion realism. Our project page is https://yong-xie-xy.github.io/ReCoM/.
   Submitted 27 March, 2025; originally announced March 2025.
   Comments: 8 pages, 6 figures, Project Page: https://yong-xie-xy.github.io/ReCoM/
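   A quick check of the headline number: the relative FGD reduction is (18.70 - 2.48) / 18.70 ≈ 0.867, matching the reported 86.7% improvement.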
8. arXiv:2503.21588 [pdf, other] | cs.LG, physics.ao-ph
   Generalizable Implicit Neural Representations via Parameterized Latent Dynamics for Baroclinic Ocean Forecasting
   Authors: Guang Zhao, Xihaier Luo, Seungjun Lee, Yihui Ren, Shinjae Yoo, Luke Van Roekel, Balu Nadiga, Sri Hari Krishna Narayanan, Yixuan Sun, Wei Xu
   Abstract: Mesoscale ocean dynamics play a critical role in climate systems, governing heat transport, hurricane genesis, and drought patterns. However, simulating these processes at high resolution remains computationally prohibitive due to their nonlinear, multiscale nature and vast spatiotemporal domains. Implicit neural representations (INRs) reduce the computational costs as resolution-independent surrogates but fail in many-query scenarios (inverse modeling) requiring rapid evaluations across diverse parameters. We present PINROD, a novel framework combining dynamics-aware implicit neural representations with parameterized neural ordinary differential equations to address these limitations. By integrating parametric dependencies into latent dynamics, our method efficiently captures nonlinear oceanic behavior across varying boundary conditions and physical parameters. Experiments on ocean mesoscale activity data show superior accuracy over existing baselines and improved computational efficiency compared to standard numerical simulations.
   Submitted 27 March, 2025; originally announced March 2025.
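   Read from the abstract, the core loop pairs an INR decoder with a parameterized ODE over latent codes. A minimal sketch of such a rollout follows, assuming learned latent_deriv and decoder callables and a plain Euler integrator; all names are placeholders, not the paper's implementation.

```python
# Minimal sketch of the idea as read from the abstract: a decoder INR maps a
# (latent state, query coordinate) pair to a field value, while a parameterized
# ODE evolves the latent state in time. Networks, integrator, and names are
# placeholder assumptions.
import numpy as np

def rollout(z0, params, latent_deriv, decoder, coords, n_steps, dt=0.01):
    """Integrate dz/dt = latent_deriv(z, params) and decode each snapshot."""
    z, frames = z0, []
    for _ in range(n_steps):
        z = z + dt * latent_deriv(z, params)  # forward-Euler step in latent space
        frames.append(decoder(z, coords))     # INR evaluated at query coordinates
    return np.stack(frames)                   # stacked forecast snapshots
```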
9. arXiv:2503.21469 [pdf, other] | eess.IV, cs.CV
   Embedding Compression Distortion in Video Coding for Machines
   Authors: Yuxiao Sun, Yao Zhao, Meiqin Liu, Chao Yao, Weisi Lin
   Abstract: Currently, video transmission serves not only the Human Visual System (HVS) for viewing but also machine perception for analysis. However, existing codecs are primarily optimized for pixel-domain and HVS-perception metrics rather than the needs of machine vision tasks. To address this issue, we propose a Compression Distortion Representation Embedding (CDRE) framework, which extracts a machine-perception-related distortion representation and embeds it into downstream models, addressing the information lost during compression and improving task performance. Specifically, to better analyze the machine-perception-related distortion, we design a compression-sensitive extractor that identifies compression degradation in the feature domain. For efficient transmission, a lightweight distortion codec is introduced to compress the distortion information into a compact representation. Subsequently, the representation is progressively embedded into the downstream model, enabling it to be better informed about compression degradation and enhancing performance. Experiments across various codecs and downstream tasks demonstrate that our framework can effectively boost the rate-task performance of existing codecs with minimal overhead in terms of bitrate, execution time, and number of parameters. Our code and supplementary materials are released at https://github.com/Ws-Syx/CDRE/.
   Submitted 27 March, 2025; originally announced March 2025.
10. arXiv:2503.21307 [pdf, other] | cs.CV, cs.AI
   InternVL-X: Advancing and Accelerating InternVL Series with Efficient Visual Token Compression
   Authors: Dongchen Lu, Yuyao Sun, Zilu Zhang, Leping Huang, Jianliang Zeng, Mao Shu, Huo Cao
   Abstract: Most multimodal large language models (MLLMs) treat visual tokens as "a sequence of text", integrating them with text tokens into a large language model (LLM). However, a great quantity of visual tokens significantly increases the demand for computational resources and time. In this paper, we propose InternVL-X, which outperforms the InternVL model in both performance and efficiency by incorporating three visual token compression methods. First, we propose a novel vision-language projector, PVTC. This component integrates adjacent visual embeddings to form a local query and utilizes the transformed CLS token as a global query, then performs point-to-region cross-attention through these local and global queries to more effectively convert visual features. Second, we present a layer-wise visual token compression module, LVTC, which compresses tokens in the LLM shallow layers and then expands them through upsampling and residual connections in the deeper layers. This significantly enhances the model's computational efficiency. Furthermore, we propose an efficient high-resolution slicing method, RVTC, which dynamically adjusts the number of visual tokens based on image area or length filtering. RVTC greatly enhances training efficiency with only a slight reduction in performance. By utilizing 20% or fewer visual tokens, InternVL-X achieves state-of-the-art performance on 7 public MLLM benchmarks, and improves the average metric by 2.34% across 12 tasks.
   Submitted 27 March, 2025; originally announced March 2025.
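   The layer-wise compress-then-expand idea behind LVTC can be pictured with plain array operations: merge visual tokens before the shallow LLM layers, then upsample and add a residual for the deeper layers. The pooling choice, the 4x ratio, and the shapes below are assumptions for illustration, not the paper's module.

```python
# Schematic sketch of a compress-then-expand token pipeline in the spirit of
# LVTC. Average pooling, nearest-neighbor upsampling, and all shapes are
# assumed; the actual module is defined in the paper.
import numpy as np

def compress(tokens, ratio=4):
    n, d = tokens.shape
    usable = n - n % ratio                     # drop a ragged tail, if any
    return tokens[:usable].reshape(-1, ratio, d).mean(axis=1)

def expand(compressed, residual, ratio=4):
    up = np.repeat(compressed, ratio, axis=0)  # nearest-neighbor upsampling
    return up + residual[: len(up)]            # residual from pre-compression tokens

tokens = np.random.randn(256, 64)  # 256 visual tokens of width 64
coarse = compress(tokens)          # (64, 64): cheaper shallow-layer processing
full = expand(coarse, tokens)      # (256, 64): restored for the deeper layers
```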
The benchmark introduces three robot multi-object grasping benchmarking protocols designed to challenge different aspects of robotic manipulation. These protocols are: 1) the Only-Pick-Once protocol, which assesses the robot&#39;s ability to efficiently pick multiple objects in a single attempt; 2) the Accurate Pick-transferring protocol, which evaluates the robot&#39;s capacity to selectively grasp and transport a specific number of objects from a cluttered environment; and 3) the Pick-transferring-all protocol, which challenges the robot to clear an entire scene by sequentially grasping and transferring all available objects. These protocols are intended to be adopted by the broader robotics research community, providing a standardized method to assess and compare robotic systems&#39; performance in multi-object grasping tasks. We establish baselines for these protocols using standard planning and perception algorithms on a Barrett hand, Robotiq parallel jaw gripper, and the Pisa/IIT Softhand-2, which is a soft underactuated robotic hand. We discuss the results in relation to human performance in similar tasks as well. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.20820v2-abstract-full').style.display = 'none'; document.getElementById('2503.20820v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 25 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">This paper contains 11 pages and 5 figures.
This paper is under review at a robotics journal</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.20745">arXiv:2503.20745</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.20745">pdf</a>, <a href="https://arxiv.org/format/2503.20745">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> MATHGLANCE: Multimodal Large Language Models Do Not Know Where to Look in Mathematical Diagrams </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Sun%2C+Y">Yanpeng Sun</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+S">Shan Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Tang%2C+W">Wei Tang</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+A">Aotian Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Koniusz%2C+P">Piotr Koniusz</a>, <a href="/search/cs?searchtype=author&amp;query=Zou%2C+K">Kai Zou</a>, <a href="/search/cs?searchtype=author&amp;query=Xue%2C+Y">Yuan Xue</a>, <a href="/search/cs?searchtype=author&amp;query=Hengel%2C+A+v+d">Anton van den Hengel</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.20745v1-abstract-short" style="display: inline;"> Diagrams serve as a fundamental form of visual language, representing complex concepts and their inter-relationships through structured symbols, shapes, and spatial arrangements. Unlike natural images, their inherently symbolic and abstract nature poses significant challenges for Multimodal Large Language Models (MLLMs). However, current benchmarks conflate perceptual and reasoning tasks, making i&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.20745v1-abstract-full').style.display = 'inline'; document.getElementById('2503.20745v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.20745v1-abstract-full" style="display: none;"> Diagrams serve as a fundamental form of visual language, representing complex concepts and their inter-relationships through structured symbols, shapes, and spatial arrangements. Unlike natural images, their inherently symbolic and abstract nature poses significant challenges for Multimodal Large Language Models (MLLMs). However, current benchmarks conflate perceptual and reasoning tasks, making it difficult to assess whether MLLMs genuinely understand mathematical diagrams beyond superficial pattern recognition. To address this gap, we introduce MATHGLANCE, a benchmark specifically designed to isolate and evaluate mathematical perception in MLLMs. MATHGLANCE comprises 1.2K images and 1.6K carefully curated questions spanning four perception tasks: shape classification, object counting, relationship identification, and object grounding, covering diverse domains including plane geometry, solid geometry, and graphical representations. Our evaluation of MLLMs reveals that their ability to understand diagrams is notably limited, particularly in fine-grained grounding tasks.
In response, we construct GeoPeP, a perception-oriented dataset of 200K structured geometry image-text pairs explicitly annotated with geometric primitives and precise spatial relationships. Training MLLM on GeoPeP leads to significant gains in perceptual accuracy, which in turn substantially improves mathematical reasoning. Our benchmark and dataset establish critical standards for evaluating and advancing multimodal mathematical understanding, providing valuable resources and insights to foster future MLLM research. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.20745v1-abstract-full').style.display = 'none'; document.getElementById('2503.20745v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.20673">arXiv:2503.20673</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.20673">pdf</a>, <a href="https://arxiv.org/format/2503.20673">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Mitigating Low-Level Visual Hallucinations Requires Self-Awareness: Database, Model and Training Strategy </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Sun%2C+Y">Yinan Sun</a>, <a href="/search/cs?searchtype=author&amp;query=Min%2C+X">Xiongkuo Min</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Z">Zicheng Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Gao%2C+Y">Yixuan Gao</a>, <a href="/search/cs?searchtype=author&amp;query=Cao%2C+Y">Yuqin Cao</a>, <a href="/search/cs?searchtype=author&amp;query=Zhai%2C+G">Guangtao Zhai</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.20673v2-abstract-short" style="display: inline;"> The rapid development of multimodal large language models has resulted in remarkable advancements in visual perception and understanding, consolidating several tasks into a single visual question-answering framework. However, these models are prone to hallucinations, which limit their reliability as artificial intelligence systems. While this issue is extensively researched in natural language pro&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.20673v2-abstract-full').style.display = 'inline'; document.getElementById('2503.20673v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.20673v2-abstract-full" style="display: none;"> The rapid development of multimodal large language models has resulted in remarkable advancements in visual perception and understanding, consolidating several tasks into a single visual question-answering framework. However, these models are prone to hallucinations, which limit their reliability as artificial intelligence systems. 
While this issue is extensively researched in natural language processing and image captioning, there remains a lack of investigation into hallucinations in Low-level Visual Perception and Understanding (HLPU), especially in the context of image quality assessment tasks. We consider that these hallucinations arise from an absence of clear self-awareness within the models. To address this issue, we first introduce the HLPU instruction database, the first instruction database specifically focused on hallucinations in low-level vision tasks. This database contains approximately 200K question-answer pairs and comprises four subsets, each covering different types of instructions. Subsequently, we propose the Self-Awareness Failure Elimination (SAFEQA) model, which utilizes image features, salient region features and quality features to improve the perception and comprehension abilities of the model in low-level vision tasks. Furthermore, we propose the Enhancing Self-Awareness Preference Optimization (ESA-PO) framework to increase the model&#39;s awareness of knowledge boundaries, thereby mitigating the incidence of hallucination. Finally, we conduct comprehensive experiments on low-level vision tasks, with the results demonstrating that our proposed method significantly enhances self-awareness of the model in these tasks and reduces hallucinations. Notably, our proposed method improves both the accuracy and self-awareness of the model and outperforms closed-source models in terms of various evaluation metrics. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.20673v2-abstract-full').style.display = 'none'; document.getElementById('2503.20673v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 26 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.19990">arXiv:2503.19990</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.19990">pdf</a>, <a href="https://arxiv.org/format/2503.19990">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> LEGO-Puzzles: How Good Are MLLMs at Multi-Step Spatial Reasoning?
</p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Tang%2C+K">Kexian Tang</a>, <a href="/search/cs?searchtype=author&amp;query=Gao%2C+J">Junyao Gao</a>, <a href="/search/cs?searchtype=author&amp;query=Zeng%2C+Y">Yanhong Zeng</a>, <a href="/search/cs?searchtype=author&amp;query=Duan%2C+H">Haodong Duan</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+Y">Yanan Sun</a>, <a href="/search/cs?searchtype=author&amp;query=Xing%2C+Z">Zhening Xing</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+W">Wenran Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Lyu%2C+K">Kaifeng Lyu</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+K">Kai Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.19990v1-abstract-short" style="display: inline;"> Multi-step spatial reasoning entails understanding and reasoning about spatial relationships across multiple sequential steps, which is crucial for tackling complex real-world applications, such as robotic manipulation, autonomous navigation, and automated assembly. To assess how well current Multimodal Large Language Models (MLLMs) have acquired this fundamental capability, we introduce \textbf{L&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.19990v1-abstract-full').style.display = 'inline'; document.getElementById('2503.19990v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.19990v1-abstract-full" style="display: none;"> Multi-step spatial reasoning entails understanding and reasoning about spatial relationships across multiple sequential steps, which is crucial for tackling complex real-world applications, such as robotic manipulation, autonomous navigation, and automated assembly. To assess how well current Multimodal Large Language Models (MLLMs) have acquired this fundamental capability, we introduce \textbf{LEGO-Puzzles}, a scalable benchmark designed to evaluate both \textbf{spatial understanding} and \textbf{sequential reasoning} in MLLMs through LEGO-based tasks. LEGO-Puzzles consists of 1,100 carefully curated visual question-answering (VQA) samples spanning 11 distinct tasks, ranging from basic spatial understanding to complex multi-step reasoning. Based on LEGO-Puzzles, we conduct a comprehensive evaluation of state-of-the-art MLLMs and uncover significant limitations in their spatial reasoning capabilities: even the most powerful MLLMs can answer only about half of the test cases, whereas human participants achieve over 90\% accuracy. In addition to VQA tasks, we evaluate MLLMs&#39; abilities to generate LEGO images following assembly illustrations. Our experiments show that only Gemini-2.0-Flash and GPT-4o exhibit a limited ability to follow these instructions, while other MLLMs either replicate the input image or generate completely irrelevant outputs. Overall, LEGO-Puzzles exposes critical deficiencies in existing MLLMs&#39; spatial understanding and sequential reasoning capabilities, and underscores the need for further advancements in multimodal spatial reasoning. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.19990v1-abstract-full').style.display = 'none'; document.getElementById('2503.19990v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">12 pages, 7 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.19824">arXiv:2503.19824</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.19824">pdf</a>, <a href="https://arxiv.org/format/2503.19824">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> </div> </div> <p class="title is-5 mathjax"> AudCast: Audio-Driven Human Video Generation by Cascaded Diffusion Transformers </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Guan%2C+J">Jiazhi Guan</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+K">Kaisiyuan Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+Z">Zhiliang Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+Q">Quanwei Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+Y">Yasheng Sun</a>, <a href="/search/cs?searchtype=author&amp;query=He%2C+S">Shengyi He</a>, <a href="/search/cs?searchtype=author&amp;query=Liang%2C+B">Borong Liang</a>, <a href="/search/cs?searchtype=author&amp;query=Cao%2C+Y">Yukang Cao</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Y">Yingying Li</a>, <a href="/search/cs?searchtype=author&amp;query=Feng%2C+H">Haocheng Feng</a>, <a href="/search/cs?searchtype=author&amp;query=Ding%2C+E">Errui Ding</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+J">Jingdong Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+Y">Youjian Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+H">Hang Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Z">Ziwei Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.19824v1-abstract-short" style="display: inline;"> Despite the recent progress of audio-driven video generation, existing methods mostly focus on driving facial movements, leading to non-coherent head and body dynamics. Moving forward, it is desirable yet challenging to generate holistic human videos with both accurate lip-sync and delicate co-speech gestures w.r.t. given audio. 
In this work, we propose AudCast, a generalized audio-driven human vi&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.19824v1-abstract-full').style.display = 'inline'; document.getElementById('2503.19824v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.19824v1-abstract-full" style="display: none;"> Despite the recent progress of audio-driven video generation, existing methods mostly focus on driving facial movements, leading to non-coherent head and body dynamics. Moving forward, it is desirable yet challenging to generate holistic human videos with both accurate lip-sync and delicate co-speech gestures w.r.t. given audio. In this work, we propose AudCast, a generalized audio-driven human video generation framework adopting a cascaded Diffusion-Transformers (DiTs) paradigm, which synthesizes holistic human videos based on a reference image and a given audio. 1) Firstly, an audio-conditioned Holistic Human DiT architecture is proposed to directly drive the movements of any human body with vivid gesture dynamics. 2) Then, to enhance hand and face details that are notoriously difficult to handle, a Regional Refinement DiT leverages regional 3D fitting as the bridge to reform the signals, producing the final results. Extensive experiments demonstrate that our framework generates high-fidelity audio-driven holistic human videos with temporal coherence and fine facial and hand details. Resources can be found at https://guanjz20.github.io/projects/AudCast. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.19824v1-abstract-full').style.display = 'none'; document.getElementById('2503.19824v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR), 2025. Project page: https://guanjz20.github.io/projects/AudCast</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.19316">arXiv:2503.19316</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.19316">pdf</a>, <a href="https://arxiv.org/format/2503.19316">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Social and Information Networks">cs.SI</span> </div> </div> <p class="title is-5 mathjax"> A Social Dynamical System for Twitter Analysis </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Xiao%2C+Z">Zhiping Xiao</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+X">Xinyu Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Qin%2C+Y">Yifang Qin</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+Z">Zijie Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Porter%2C+M+A">Mason A.
Porter</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+Y">Yizhou Sun</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.19316v3-abstract-short" style="display: inline;"> Understanding the evolution of public opinion is crucial for informed decision-making in various domains, particularly public affairs. The rapid growth of social networks, such as Twitter (now rebranded as X), provides an unprecedented opportunity to analyze public opinion at scale without relying on traditional surveys. With the rise of deep learning, Graph Neural Networks (GNNs) have shown great&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.19316v3-abstract-full').style.display = 'inline'; document.getElementById('2503.19316v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.19316v3-abstract-full" style="display: none;"> Understanding the evolution of public opinion is crucial for informed decision-making in various domains, particularly public affairs. The rapid growth of social networks, such as Twitter (now rebranded as X), provides an unprecedented opportunity to analyze public opinion at scale without relying on traditional surveys. With the rise of deep learning, Graph Neural Networks (GNNs) have shown great promise in modeling online opinion dynamics. Notably, classical opinion dynamics models, such as DeGroot, can be reformulated within a GNN framework. We introduce Latent Social Dynamical System (LSDS), a novel framework for modeling the latent dynamics of social media users&#39; opinions based on textual content. Since expressed opinions may not fully reflect underlying beliefs, LSDS first encodes post content into latent representations. It then leverages a GraphODE framework, using a GNN-based ODE function to predict future opinions. A decoder subsequently utilizes these predicted latent opinions to perform downstream tasks, such as interaction prediction, which serve as benchmarks for model evaluation. Our framework is highly flexible, supporting various opinion dynamic models as ODE functions, provided they can be adapted into a GNN-based form. It also accommodates different encoder architectures and is compatible with diverse downstream tasks. To validate our approach, we constructed dynamic datasets from Twitter data. Experimental results demonstrate the effectiveness of LSDS, highlighting its potential for future applications. We plan to publicly release our dataset and code upon the publication of this paper. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.19316v3-abstract-full').style.display = 'none'; document.getElementById('2503.19316v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 24 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. 
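<p class="is-size-7">A minimal sketch of the LSDS pipeline as the abstract describes it: latent user opinions evolve under a GNN-parameterized ODE over the social graph. A fixed-step Euler integrator stands in for a full GraphODE solver, and all module names and shapes are illustrative assumptions rather than the authors' code.</p>
<pre><code>
# Hedged sketch of a GraphODE-style opinion model (illustrative only).
import torch
import torch.nn as nn

class GNNODEFunc(nn.Module):
    def __init__(self, dim):
        super().__init__()
        self.msg = nn.Linear(dim, dim)
        self.upd = nn.Linear(2 * dim, dim)

    def forward(self, z, adj):
        # z: (num_users, dim) latent opinions; adj: row-normalized adjacency
        neighbor = adj @ self.msg(z)  # aggregate neighbors' latent opinions
        return torch.tanh(self.upd(torch.cat([z, neighbor], dim=-1)))  # dz/dt

def evolve(z0, adj, func, t1=1.0, steps=20):
    z, dt = z0, t1 / steps
    for _ in range(steps):
        z = z + dt * func(z, adj)  # explicit Euler step
    return z  # predicted latent opinions at time t1, fed to a task decoder
</code></pre>
<p class="is-size-7">Swapping the Euler loop for an adaptive ODE solver recovers the usual GraphODE setup; a DeGroot-style variant would make the update a linear function of the neighbor average.</p>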
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">will be submitted to a journal soon</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">MSC Class:</span> 68T07; 34-04; 37N99 <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> J.4; K.4.2 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.19267">arXiv:2503.19267</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.19267">pdf</a>, <a href="https://arxiv.org/format/2503.19267">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> NeoRL-2: Near Real-World Benchmarks for Offline Reinforcement Learning with Extended Realistic Scenarios </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Gao%2C+S">Songyi Gao</a>, <a href="/search/cs?searchtype=author&amp;query=Tu%2C+Z">Zuolin Tu</a>, <a href="/search/cs?searchtype=author&amp;query=Qin%2C+R">Rong-Jun Qin</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+Y">Yi-Hao Sun</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+X">Xiong-Hui Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Yu%2C+Y">Yang Yu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.19267v1-abstract-short" style="display: inline;"> Offline reinforcement learning (RL) aims to learn from historical data without requiring (costly) access to the environment. To facilitate offline RL research, we previously introduced NeoRL, which highlighted that datasets from real-world tasks are often conservative and limited. With years of experience applying offline RL to various domains, we have identified additional real-world challenges.&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.19267v1-abstract-full').style.display = 'inline'; document.getElementById('2503.19267v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.19267v1-abstract-full" style="display: none;"> Offline reinforcement learning (RL) aims to learn from historical data without requiring (costly) access to the environment. To facilitate offline RL research, we previously introduced NeoRL, which highlighted that datasets from real-world tasks are often conservative and limited. With years of experience applying offline RL to various domains, we have identified additional real-world challenges. These include extremely conservative data distributions produced by deployed control systems, delayed action effects caused by high-latency transitions, external factors arising from the uncontrollable variance of transitions, and global safety constraints that are difficult to evaluate during the decision-making process. These challenges are underrepresented in previous benchmarks but frequently occur in real-world tasks. 
To address this, we constructed the extended Near Real-World Offline RL Benchmark (NeoRL-2), which consists of 7 datasets from 7 simulated tasks along with their corresponding evaluation simulators. Benchmarking results from state-of-the-art offline RL approaches demonstrate that current methods often struggle to outperform the data-collection behavior policy, highlighting the need for more effective methods. We hope NeoRL-2 will accelerate the development of reinforcement learning algorithms for real-world applications. The benchmark project page is available at https://github.com/polixir/NeoRL2. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.19267v1-abstract-full').style.display = 'none'; document.getElementById('2503.19267v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.18874">arXiv:2503.18874</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.18874">pdf</a>, <a href="https://arxiv.org/format/2503.18874">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> A semantic communication-based workload-adjustable transceiver for wireless AI-generated content (AIGC) delivery </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Cheng%2C+R">Runze Cheng</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+Y">Yao Sun</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+L">Lan Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Feng%2C+L">Lei Feng</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+L">Lei Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Imran%2C+M+A">Muhammad Ali Imran</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.18874v1-abstract-short" style="display: inline;"> With the significant advances in generative AI (GAI) and the proliferation of mobile devices, providing high-quality AI-generated content (AIGC) services via wireless networks is becoming the future direction. However, the primary challenges of AIGC service delivery in wireless networks lie in unstable channels, limited bandwidth resources, and unevenly distributed computational resources. In this&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.18874v1-abstract-full').style.display = 'inline'; document.getElementById('2503.18874v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.18874v1-abstract-full" style="display: none;"> With the significant advances in generative AI (GAI) and the proliferation of mobile devices, providing high-quality AI-generated content (AIGC) services via wireless networks is becoming the future direction. 
However, the primary challenges of AIGC service delivery in wireless networks lie in unstable channels, limited bandwidth resources, and unevenly distributed computational resources. In this paper, we employ semantic communication (SemCom) in diffusion-based GAI models to propose a Resource-aware wOrkload-adjUstable TransceivEr (ROUTE) for AIGC delivery in dynamic wireless networks. Specifically, to relieve the communication resource bottleneck, SemCom is utilized to prioritize semantic information of the generated content. Then, to improve computational resource utilization on both edge and local devices and to reduce AIGC semantic distortion in transmission, modified diffusion-based models are applied to adjust the computing workload and semantic density in cooperative content generation. Simulations verify the superiority of our proposed ROUTE in terms of latency and content quality compared to conventional AIGC approaches. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.18874v1-abstract-full').style.display = 'none'; document.getElementById('2503.18874v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.18794">arXiv:2503.18794</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.18794">pdf</a>, <a href="https://arxiv.org/format/2503.18794">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> NexusGS: Sparse View Synthesis with Epipolar Depth Priors in 3D Gaussian Splatting </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zheng%2C+Y">Yulong Zheng</a>, <a href="/search/cs?searchtype=author&amp;query=Jiang%2C+Z">Zicheng Jiang</a>, <a href="/search/cs?searchtype=author&amp;query=He%2C+S">Shengfeng He</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+Y">Yandu Sun</a>, <a href="/search/cs?searchtype=author&amp;query=Dong%2C+J">Junyu Dong</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+H">Huaidong Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Du%2C+Y">Yong Du</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.18794v1-abstract-short" style="display: inline;"> Neural Radiance Field (NeRF) and 3D Gaussian Splatting (3DGS) have noticeably advanced photo-realistic novel view synthesis using images from densely spaced camera viewpoints. However, these methods struggle in few-shot scenarios due to limited supervision.
In this paper, we present NexusGS, a 3DGS-based approach that enhances novel view synthesis from sparse-view images by directly embedding dept&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.18794v1-abstract-full').style.display = 'inline'; document.getElementById('2503.18794v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.18794v1-abstract-full" style="display: none;"> Neural Radiance Field (NeRF) and 3D Gaussian Splatting (3DGS) have noticeably advanced photo-realistic novel view synthesis using images from densely spaced camera viewpoints. However, these methods struggle in few-shot scenarios due to limited supervision. In this paper, we present NexusGS, a 3DGS-based approach that enhances novel view synthesis from sparse-view images by directly embedding depth information into point clouds, without relying on complex manual regularizations. Exploiting the inherent epipolar geometry of 3DGS, our method introduces a novel point cloud densification strategy that initializes 3DGS with a dense point cloud, reducing randomness in point placement while preventing over-smoothing and overfitting. Specifically, NexusGS comprises three key steps: Epipolar Depth Nexus, Flow-Resilient Depth Blending, and Flow-Filtered Depth Pruning. These steps leverage optical flow and camera poses to compute accurate depth maps, while mitigating the inaccuracies often associated with optical flow. By incorporating epipolar depth priors, NexusGS ensures reliable dense point cloud coverage and supports stable 3DGS training under sparse-view conditions. Experiments demonstrate that NexusGS significantly enhances depth accuracy and rendering quality, surpassing state-of-the-art methods by a considerable margin. Furthermore, we validate the superiority of our generated point clouds by substantially boosting the performance of competing methods. Project page: https://usmizuki.github.io/NexusGS/. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.18794v1-abstract-full').style.display = 'none'; document.getElementById('2503.18794v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. 
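<p class="is-size-7">The geometric ingredient behind an epipolar-depth scheme of this kind can be sketched in a few lines: optical flow supplies a pixel correspondence between two posed views, and linear triangulation recovers the point's depth. This is a hedged illustration of the underlying geometry, not the NexusGS implementation:</p>
<pre><code>
# Hedged sketch: depth from an optical-flow correspondence plus camera poses,
# via standard DLT triangulation (illustrative, not the paper's code).
import numpy as np

def depth_from_flow_match(K, R, t, x1, x2):
    # K: 3x3 intrinsics; (R, t): pose of view 2 relative to view 1;
    # x1, x2: matched pixels (flow maps x1 in view 1 to x2 in view 2).
    P1 = K @ np.hstack([np.eye(3), np.zeros((3, 1))])
    P2 = K @ np.hstack([R, t.reshape(3, 1)])
    A = np.stack([
        x1[0] * P1[2] - P1[0],
        x1[1] * P1[2] - P1[1],
        x2[0] * P2[2] - P2[0],
        x2[1] * P2[2] - P2[1],
    ])
    _, _, vt = np.linalg.svd(A)      # least-squares homogeneous solution
    X = vt[-1]
    X = X[:3] / X[3]                 # homogeneous to Euclidean
    return X[2]                      # depth in view-1 coordinates
</code></pre>
<p class="is-size-7">A NexusGS-style pipeline would run this densely, one correspondence per pixel from the flow field, and then blend or prune the resulting depths across views, as the abstract's blending and pruning steps describe.</p>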
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">This paper is accepted by CVPR 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.18034">arXiv:2503.18034</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.18034">pdf</a>, <a href="https://arxiv.org/format/2503.18034">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Expanding the Boundaries of Vision Prior Knowledge in Multi-modal Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Liang%2C+Q">Qiao Liang</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Y">Yanjiang Liu</a>, <a href="/search/cs?searchtype=author&amp;query=He%2C+B">Ben He</a>, <a href="/search/cs?searchtype=author&amp;query=Lu%2C+Y">Yaojie Lu</a>, <a href="/search/cs?searchtype=author&amp;query=Lin%2C+H">Hongyu Lin</a>, <a href="/search/cs?searchtype=author&amp;query=Zheng%2C+J">Jia Zheng</a>, <a href="/search/cs?searchtype=author&amp;query=Han%2C+X">Xianpei Han</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+L">Le Sun</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+Y">Yingfei Sun</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.18034v1-abstract-short" style="display: inline;"> Does the prior knowledge of the vision encoder constrain the capability boundary of Multi-modal Large Language Models (MLLMs)? While most existing research treats MLLMs as unified systems optimized through end-to-end training, the impact of vision encoder&#39;s prior knowledge is seldom investigated. In this work, we introduce a novel metric, $Rank_e$, to quantify the effect of the vision encoder&#39;s pr&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.18034v1-abstract-full').style.display = 'inline'; document.getElementById('2503.18034v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.18034v1-abstract-full" style="display: none;"> Does the prior knowledge of the vision encoder constrain the capability boundary of Multi-modal Large Language Models (MLLMs)? While most existing research treats MLLMs as unified systems optimized through end-to-end training, the impact of vision encoder&#39;s prior knowledge is seldom investigated. In this work, we introduce a novel metric, $Rank_e$, to quantify the effect of the vision encoder&#39;s prior knowledge on MLLM performance. Our analysis reveals a positive correlation between prior knowledge and MLLM performance. Moreover, we find that domain-specific fine-tuning using solely end-to-end visual question answering (VQA) data is insufficient--particularly for entities with low inherent visual prior knowledge. To address this issue, we propose VisPRE (Vision Prior Remediation), a two-stage training framework that explicitly incorporates prior knowledge at the vision encoder level. 
Experimental results demonstrate that augmenting vision encoder&#39;s prior knowledge substantially boosts the visual understanding capabilities of MLLMs, offering a novel and effective strategy for improving performance, especially in scenarios involving uncommon visual entities. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.18034v1-abstract-full').style.display = 'none'; document.getElementById('2503.18034v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.17913">arXiv:2503.17913</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.17913">pdf</a>, <a href="https://arxiv.org/ps/2503.17913">ps</a>, <a href="https://arxiv.org/format/2503.17913">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Networking and Internet Architecture">cs.NI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1109/TVT.2024.3463548">10.1109/TVT.2024.3463548 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Cache-Aware Cooperative Multicast Beamforming in Dynamic Satellite-Terrestrial Networks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Yuan%2C+S">Shuo Yuan</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+Y">Yaohua Sun</a>, <a href="/search/cs?searchtype=author&amp;query=Peng%2C+M">Mugen Peng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.17913v1-abstract-short" style="display: inline;"> With the burgeoning demand for data-intensive services, satellite-terrestrial networks (STNs) face increasing backhaul link congestion, deteriorating user quality of service (QoS), and escalating power consumption. Cache-aided STNs are acknowledged as a promising paradigm for accelerating content delivery to users and alleviating the load of backhaul links. However, the dynamic nature of low earth&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.17913v1-abstract-full').style.display = 'inline'; document.getElementById('2503.17913v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.17913v1-abstract-full" style="display: none;"> With the burgeoning demand for data-intensive services, satellite-terrestrial networks (STNs) face increasing backhaul link congestion, deteriorating user quality of service (QoS), and escalating power consumption. Cache-aided STNs are acknowledged as a promising paradigm for accelerating content delivery to users and alleviating the load of backhaul links. 
However, the dynamic nature of low earth orbit (LEO) satellites and the complex interference among satellite beams and terrestrial base stations pose challenges in effectively managing limited edge resources. To address these issues, this paper proposes a method for dynamically scheduling caching and communication resources, aiming to reduce network costs in terms of transmission power consumption and backhaul traffic, while meeting user QoS demands and resource constraints. We formulate a mixed timescale problem to jointly optimize cache placement, LEO satellite beam direction, and cooperative multicast beamforming among satellite beams and base stations. To tackle this intricate problem, we propose a two-stage solution framework, where the primary problem is decoupled into a short-term content delivery subproblem and a long-term cache placement subproblem. The former subproblem is solved by designing an alternating optimization approach with whale optimization and successive convex approximation methods according to the cache placement state, while cache content in STNs is updated using an iterative algorithm that utilizes historical information. Simulation results demonstrate the effectiveness of our proposed algorithms, showcasing their convergence and significantly reducing transmission power consumption and backhaul traffic by up to 52%. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.17913v1-abstract-full').style.display = 'none'; document.getElementById('2503.17913v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. 
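<p class="is-size-7 mathjax">Schematically, the mixed-timescale problem described above minimizes a long-run network cost in transmit power and backhaul traffic over cache placement and beamformers, subject to QoS and cache-capacity constraints. The formulation below is an illustrative reading with assumed notation ($c_f$ cache placement, $\mathbf{w}_b(t)$ beamformers, $d_f(t)$ backhaul traffic of content $f$, $s_f$ content size, $S$ cache capacity), not the paper's exact model: $$\min_{\{c_f\},\,\{\mathbf{w}_b(t)\}}\ \lim_{T\to\infty}\frac{1}{T}\sum_{t=1}^{T}\Big(\alpha\sum_{b}\lVert\mathbf{w}_b(t)\rVert^2+\beta\sum_{f}(1-c_f)\,d_f(t)\Big)\quad\text{s.t. }\ \mathrm{SINR}_u(t)\ge\gamma_u\ \forall u,\quad \sum_{f}c_f s_f\le S.$$ The two-stage framework then fixes $\{c_f\}$ on the slow timescale and solves the per-slot beamforming subproblem on the fast one.</p>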
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by IEEE Transactions on Vehicular Technology</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.17912">arXiv:2503.17912</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.17912">pdf</a>, <a href="https://arxiv.org/format/2503.17912">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Networking and Internet Architecture">cs.NI</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1109/MWC.011.2400262">10.1109/MWC.011.2400262 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Satellite-Terrestrial Integrated Fog Networks: Architecture, Technologies, and Challenges </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Yuan%2C+S">Shuo Yuan</a>, <a href="/search/cs?searchtype=author&amp;query=Peng%2C+M">Mugen Peng</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+Y">Yaohua Sun</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.17912v1-abstract-short" style="display: inline;"> In the evolution of sixth-generation (6G) mobile communication networks, satellite-terrestrial integrated networks emerge as a promising paradigm, characterized by their wide coverage and reliable transmission capabilities. By integrating with cloud-based terrestrial mobile communication networks, the limitations of low Earth orbit (LEO) satellites, such as insufficient onboard computing capabilit&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.17912v1-abstract-full').style.display = 'inline'; document.getElementById('2503.17912v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.17912v1-abstract-full" style="display: none;"> In the evolution of sixth-generation (6G) mobile communication networks, satellite-terrestrial integrated networks emerge as a promising paradigm, characterized by their wide coverage and reliable transmission capabilities. By integrating with cloud-based terrestrial mobile communication networks, the limitations of low Earth orbit (LEO) satellites, such as insufficient onboard computing capabilities and limited inter-satellite link capacity, can be addressed. In addition, to efficiently respond to the diverse integrated tasks of communication, remote sensing, and navigation, LEO constellations need to be capable of autonomous networking. To this end, this article presents a satellite-terrestrial integrated fog network for 6G. Its system architecture and key technologies are introduced to achieve flexible collaboration between fog satellites and terrestrial cloud computing centers. 
In particular, key techniques with diverse challenges and their corresponding solutions are discussed, including integrated waveform design and resource management based on fog satellite onboard processing, as well as mobility management and native artificial intelligence based on cloud-fog collaboration. Finally, future challenges and open issues are outlined. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.17912v1-abstract-full').style.display = 'none'; document.getElementById('2503.17912v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by IEEE Wireless Communications</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.17777">arXiv:2503.17777</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.17777">pdf</a>, <a href="https://arxiv.org/ps/2503.17777">ps</a>, <a href="https://arxiv.org/format/2503.17777">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Hierarchy-Aware and Channel-Adaptive Semantic Communication for Bandwidth-Limited Data Fusion </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Guo%2C+L">Lei Guo</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+W">Wei Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+Y">Yuxuan Sun</a>, <a href="/search/cs?searchtype=author&amp;query=Ai%2C+B">Bo Ai</a>, <a href="/search/cs?searchtype=author&amp;query=Pappas%2C+N">Nikolaos Pappas</a>, <a href="/search/cs?searchtype=author&amp;query=Quek%2C+T">Tony Quek</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.17777v1-abstract-short" style="display: inline;"> Obtaining high-resolution hyperspectral images (HR-HSI) is costly and data-intensive, making it necessary to fuse low-resolution hyperspectral images (LR-HSI) with high-resolution RGB images (HR-RGB) for practical applications. However, traditional fusion techniques, which integrate detailed information into the reconstruction, significantly increase bandwidth consumption compared to directly tran&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.17777v1-abstract-full').style.display = 'inline'; document.getElementById('2503.17777v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.17777v1-abstract-full" style="display: none;"> Obtaining high-resolution hyperspectral images (HR-HSI) is costly and data-intensive, making it necessary to fuse low-resolution hyperspectral images (LR-HSI) with high-resolution RGB images (HR-RGB) for practical applications. 
However, traditional fusion techniques, which integrate detailed information into the reconstruction, significantly increase bandwidth consumption compared to directly transmitting raw data. To overcome these challenges, we propose a hierarchy-aware and channel-adaptive semantic communication approach for bandwidth-limited data fusion. A hierarchical correlation module is proposed to preserve both the overall structural information and the details of the image required for super-resolution. This module efficiently combines deep semantic and shallow features from LR-HSI and HR-RGB. To further reduce bandwidth usage while preserving reconstruction quality, a channel-adaptive attention mechanism based on Transformer is proposed to dynamically integrate and transmit the deep and shallow features, enabling efficient data transmission and high-quality HR-HSI reconstruction. Experimental results on the CAVE and Washington DC Mall datasets demonstrate that our method outperforms single-source transmission, achieving up to a 2 dB improvement in peak signal-to-noise ratio (PSNR). Additionally, it reduces bandwidth consumption by two-thirds, confirming its effectiveness in bandwidth-constrained environments for HR-HSI reconstruction tasks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.17777v1-abstract-full').style.display = 'none'; document.getElementById('2503.17777v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by the WCL</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.17709">arXiv:2503.17709</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.17709">pdf</a>, <a href="https://arxiv.org/format/2503.17709">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> GUI-Xplore: Empowering Generalizable GUI Agents with One Exploration </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Sun%2C+Y">Yuchen Sun</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+S">Shanhui Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Yu%2C+T">Tao Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Wen%2C+H">Hao Wen</a>, <a href="/search/cs?searchtype=author&amp;query=Va%2C+S">Samith Va</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+M">Mengwei Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Y">Yuanchun Li</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+C">Chongyang Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.17709v1-abstract-short" style="display: inline;"> GUI agents hold significant potential to enhance the experience and efficiency of human-device 
interaction. However, current methods face challenges in generalizing across applications (apps) and tasks, primarily due to two fundamental limitations in existing datasets. First, these datasets overlook developer-induced structural variations among apps, limiting the transferability of knowledge acros&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.17709v1-abstract-full').style.display = 'inline'; document.getElementById('2503.17709v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.17709v1-abstract-full" style="display: none;"> GUI agents hold significant potential to enhance the experience and efficiency of human-device interaction. However, current methods face challenges in generalizing across applications (apps) and tasks, primarily due to two fundamental limitations in existing datasets. First, these datasets overlook developer-induced structural variations among apps, limiting the transferability of knowledge across diverse software environments. Second, many of them focus solely on navigation tasks, which restricts their capacity to represent comprehensive software architectures and complex user interactions. To address these challenges, we introduce GUI-Xplore, a dataset meticulously designed to enhance cross-application and cross-task generalization via an exploration-and-reasoning framework. GUI-Xplore integrates pre-recorded exploration videos providing contextual insights, alongside five hierarchically structured downstream tasks designed to comprehensively evaluate GUI agent capabilities. To fully exploit GUI-Xplore&#39;s unique features, we propose Xplore-Agent, a GUI agent framework that combines Action-aware GUI Modeling with Graph-Guided Environment Reasoning. Further experiments indicate that Xplore-Agent achieves a 10% improvement over existing methods in unfamiliar environments, yet there remains significant potential for further enhancement towards truly generalizable GUI agents. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.17709v1-abstract-full').style.display = 'none'; document.getElementById('2503.17709v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. 
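<p class="is-size-7">The graph-guided idea can be pictured with a small sketch: an exploration trace of (screen, action, next screen) steps becomes a transition graph over which an agent can search for an action sequence. The data layout and helper names are illustrative assumptions, not the GUI-Xplore format:</p>
<pre><code>
# Hedged sketch: build a GUI transition graph from an exploration trace and
# plan over it with breadth-first search (illustrative data layout).
from collections import defaultdict, deque

def build_gui_graph(trace):
    graph = defaultdict(dict)          # screen -> {action: next_screen}
    for screen, action, nxt in trace:
        graph[screen][action] = nxt
    return graph

def plan(graph, start, goal):
    # shortest sequence of UI actions from start screen to goal screen
    frontier, seen = deque([(start, [])]), {start}
    while frontier:
        screen, path = frontier.popleft()
        if screen == goal:
            return path
        for action, nxt in graph[screen].items():
            if nxt not in seen:
                seen.add(nxt)
                frontier.append((nxt, path + [action]))
    return None  # goal screen was never reached during exploration
</code></pre>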
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">CVPR 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.17551">arXiv:2503.17551</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.17551">pdf</a>, <a href="https://arxiv.org/format/2503.17551">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Audio-Enhanced Vision-Language Modeling with Latent Space Broadening for High Quality Data Expansion </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Sun%2C+Y">Yu Sun</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Y">Yin Li</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+R">Ruixiao Sun</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+C">Chunhui Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+F">Fangming Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Jin%2C+Z">Ze Jin</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+L">Linjie Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Shen%2C+X">Xiang Shen</a>, <a href="/search/cs?searchtype=author&amp;query=Hao%2C+Z">Zhuolin Hao</a>, <a href="/search/cs?searchtype=author&amp;query=Xiong%2C+H">Hongyu Xiong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.17551v1-abstract-short" style="display: inline;"> Transformer-based multimodal models are widely used in industrial-scale recommendation, search, and advertising systems for content understanding and relevance ranking. Enhancing labeled training data quality and cross-modal fusion significantly improves model performance, influencing key metrics such as quality view rates and ad revenue. High-quality annotations are crucial for advancing content&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.17551v1-abstract-full').style.display = 'inline'; document.getElementById('2503.17551v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.17551v1-abstract-full" style="display: none;"> Transformer-based multimodal models are widely used in industrial-scale recommendation, search, and advertising systems for content understanding and relevance ranking. Enhancing labeled training data quality and cross-modal fusion significantly improves model performance, influencing key metrics such as quality view rates and ad revenue. 
arXiv:2503.17551 [pdf, other] (cs.MM, cs.AI, cs.CV, cs.SD, eess.AS)
Title: Audio-Enhanced Vision-Language Modeling with Latent Space Broadening for High Quality Data Expansion
Authors: Yu Sun, Yin Li, Ruixiao Sun, Chunhui Liu, Fangming Zhou, Ze Jin, Linjie Wang, Xiang Shen, Zhuolin Hao, Hongyu Xiong
Abstract: Transformer-based multimodal models are widely used in industrial-scale recommendation, search, and advertising systems for content understanding and relevance ranking. Enhancing labeled training data quality and cross-modal fusion significantly improves model performance, influencing key metrics such as quality view rates and ad revenue. High-quality annotations are crucial for advancing content modeling, yet traditional statistical-based active learning (AL) methods face limitations: they struggle to detect overconfident misclassifications and are less effective at distinguishing semantically similar items in deep neural networks. Additionally, audio information plays an increasing role, especially on short-video platforms, yet most pre-trained multimodal architectures primarily focus on text and images. While training from scratch across all three modalities is possible, it sacrifices the benefits of leveraging existing pre-trained visual-language (VL) and audio models. To address these challenges, we propose kNN-based Latent Space Broadening (LSB) to enhance AL efficiency, and Vision-Language Modeling with Audio Enhancement (VLMAE), a mid-fusion approach integrating audio into VL models. This system has been deployed in production, leading to significant business gains.
Submitted 21 March, 2025; originally announced March 2025.
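One natural reading of kNN-based Latent Space Broadening is to grow the annotation batch with latent-space neighbors of samples the active-learning scorer flags as uncertain. A minimal sketch under that reading (all names are hypothetical; the paper's exact selection rule may differ):

```python
import numpy as np
from sklearn.neighbors import NearestNeighbors

def broaden_selection(embeddings, uncertain_idx, k=5):
    """Expand an active-learning batch with the k nearest latent-space
    neighbors of each uncertain seed sample."""
    nn = NearestNeighbors(n_neighbors=k + 1).fit(embeddings)
    _, idx = nn.kneighbors(embeddings[uncertain_idx])
    # Column 0 is each seed itself (distance 0); drop it and de-duplicate.
    return np.unique(idx[:, 1:])

emb = np.random.randn(1000, 64)        # hypothetical latent embeddings
seeds = np.array([3, 42, 417])         # samples flagged as uncertain
print(broaden_selection(emb, seeds, k=5))
```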
arXiv:2503.17126 [pdf, other] (cs.CL, cs.LG)
Title: Modifying Large Language Model Post-Training for Diverse Creative Writing
Authors: John Joon Young Chung, Vishakh Padmakumar, Melissa Roemmele, Yuqian Sun, Max Kreminski
Abstract: As creative writing tasks do not have singular correct answers, large language models (LLMs) trained to perform these tasks should be able to generate diverse valid outputs. However, LLM post-training often focuses on improving generation quality but neglects to facilitate output diversity. Hence, in creative writing generation, we investigate post-training approaches to promote both output diversity and quality. Our core idea is to include deviation (the degree of difference between a training sample and all other samples with the same prompt) in the training objective, to facilitate learning from rare high-quality instances. By applying our approach to direct preference optimization (DPO) and odds ratio preference optimization (ORPO), we demonstrate that we can promote the output diversity of trained models while minimally decreasing quality. Our best model, with 8B parameters, achieves diversity on par with a human-created dataset while matching the output quality of the best instruction-tuned models we examined, GPT-4o and DeepSeek-R1. We further validate our approaches with a human evaluation, an ablation, and a comparison to an existing diversification approach, DivPO.
Submitted 21 March, 2025; originally announced March 2025.
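The abstract folds "deviation" into the preference-optimization objective. One plausible, purely illustrative instantiation computes deviation as a mean embedding distance and uses it to weight the standard DPO loss; the paper's actual formulation may differ.

```python
import numpy as np

def deviation(sample_emb, group_embs):
    """Degree of difference between one sample and the other samples
    sharing its prompt (here: mean Euclidean distance)."""
    return float(np.mean(np.linalg.norm(group_embs - sample_emb, axis=1)))

def weighted_dpo_loss(logp_chosen, logp_rejected, ref_chosen, ref_rejected,
                      dev, beta=0.1, alpha=1.0):
    """Standard DPO loss scaled by a deviation weight, so rare,
    high-quality responses contribute more to the update."""
    margin = beta * ((logp_chosen - ref_chosen) - (logp_rejected - ref_rejected))
    loss = -np.log(1.0 / (1.0 + np.exp(-margin)))   # -log(sigmoid(margin))
    return (1.0 + alpha * dev) * loss

# Toy numbers: the chosen response is preferred and unusually distinct.
print(weighted_dpo_loss(-10.0, -12.0, -10.5, -11.5, dev=2.3))
```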
arXiv:2503.17005 [pdf] (cs.RO, eess.SY)
Title: Autonomous Exploration-Based Precise Mapping for Mobile Robots through Stepwise and Consistent Motions
Authors: Muhua Zhang, Lei Ma, Ying Wu, Kai Shen, Yongkui Sun, Henry Leung
Abstract: This paper presents an autonomous exploration framework designed for indoor ground mobile robots that use laser Simultaneous Localization and Mapping (SLAM), ensuring process completeness and precise mapping results. For frontier search, a local-global sampling architecture based on multiple Rapidly Exploring Random Trees (RRTs) is employed. Traversability checks during RRT expansion and global RRT pruning upon map updates eliminate unreachable frontiers, reducing potential collisions and deadlocks. Adaptive sampling density adjustments, informed by obstacle distribution, enhance exploration coverage potential. For frontier point navigation, a stepwise consistent motion strategy is adopted, wherein the robot drives straight along approximately equidistant segments of the polyline path and rotates in place at segment junctions. This simplified, decoupled motion pattern improves scan-matching stability and mitigates map drift. For process control, the framework serializes frontier point selection and navigation, avoiding the oscillation caused by frequent goal changes in conventional parallelized processes. A waypoint retracing mechanism is introduced to generate repeated observations, triggering loop closure detection and backend optimization in graph-based SLAM, thereby improving map consistency and precision. Experiments in both simulation and real-world scenarios validate the effectiveness of the framework: it achieves improved mapping coverage and precision in more challenging environments compared to baseline 2D exploration algorithms, and it shows robustness in supporting resource-constrained robot platforms and maintaining mapping consistency across various LiDAR field-of-view (FoV) configurations.
Submitted 21 March, 2025; originally announced March 2025.
Comments: 8 pages, 11 figures. This work has been submitted to the IEEE for possible publication.
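The stepwise, decoupled motion pattern described above (rotate in place, then drive straight) can be sketched as a simple path decomposition. A toy version, with hypothetical names and no angle wrapping or tolerances that a real controller would need:

```python
import math

def polyline_to_primitives(waypoints):
    """Decompose a polyline path into rotate-in-place and drive-straight
    primitives, mirroring the stepwise consistent motion strategy."""
    primitives, heading = [], None
    for (x0, y0), (x1, y1) in zip(waypoints, waypoints[1:]):
        target = math.atan2(y1 - y0, x1 - x0)
        if heading is None or abs(target - heading) > 1e-9:
            primitives.append(("rotate_to", target))   # turn at the junction
            heading = target
        primitives.append(("drive_straight", math.hypot(x1 - x0, y1 - y0)))
    return primitives

print(polyline_to_primitives([(0, 0), (1, 0), (1, 1)]))
```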
arXiv:2503.16910 [pdf, other] (cs.CV)
Title: Salient Object Detection in Traffic Scene through the TSOD10K Dataset
Authors: Yu Qiu, Yuhang Sun, Jie Mei, Lin Xiao, Jing Xu
Abstract: Traffic Salient Object Detection (TSOD) aims to segment the objects critical to driving safety by combining semantic (e.g., collision risks) and visual saliency. Unlike SOD in natural scene images (NSI-SOD), which prioritizes visually distinctive regions, TSOD emphasizes the objects that demand immediate driver attention due to their semantic impact, even with low visual contrast. This dual criterion, i.e., bridging perception and contextual risk, re-defines saliency for autonomous and assisted driving systems. To address the lack of task-specific benchmarks, we collect the first large-scale TSOD dataset with pixel-wise saliency annotations, named TSOD10K. TSOD10K covers diverse object categories in various real-world traffic scenes under challenging weather/illumination variations (e.g., fog, snowstorms, low contrast, and low light). Methodologically, we propose a Mamba-based TSOD model, termed Tramba. Considering the challenge of distinguishing inconspicuous visual information from complex traffic backgrounds, Tramba introduces a novel Dual-Frequency Visual State Space module equipped with shifted window partitioning and dilated scanning to enhance the perception of fine details and global structure by hierarchically decomposing high/low-frequency components. To emphasize critical regions in traffic scenes, we propose a traffic-oriented Helix 2D-Selective-Scan (Helix-SS2D) mechanism that injects driving attention priors while effectively capturing global multi-direction spatial dependencies. We establish a comprehensive benchmark by evaluating Tramba and 22 existing NSI-SOD models on TSOD10K, demonstrating Tramba's superiority. Our research establishes the first foundation for safety-aware saliency analysis in intelligent transportation systems.
Submitted 21 March, 2025; originally announced March 2025.
Comments: 12 pages, 12 figures
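The dual-frequency idea above rests on splitting features into low-frequency (global structure) and high-frequency (fine detail) components. A generic Fourier-domain stand-in for that decomposition, not the paper's actual module:

```python
import numpy as np

def split_frequencies(feat, cutoff=0.25):
    """Split a 2-D feature map into low- and high-frequency parts with a
    radial mask in the Fourier domain (illustrative stand-in only)."""
    f = np.fft.fftshift(np.fft.fft2(feat))
    h, w = feat.shape
    yy, xx = np.mgrid[-h // 2:(h + 1) // 2, -w // 2:(w + 1) // 2]
    mask = np.hypot(yy / h, xx / w) <= cutoff   # keep low frequencies
    low = np.fft.ifft2(np.fft.ifftshift(f * mask)).real
    return low, feat - low                      # high = residual

feat = np.random.randn(64, 64)
low, high = split_frequencies(feat)
```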
arXiv:2503.16867 [pdf, other] (cs.CV)
Title: ETVA: Evaluation of Text-to-Video Alignment via Fine-grained Question Generation and Answering
Authors: Kaisi Guan, Zhengfeng Lai, Yuchong Sun, Peng Zhang, Wei Liu, Kieran Liu, Meng Cao, Ruihua Song
Abstract: Precisely evaluating semantic alignment between text prompts and generated videos remains a challenge in Text-to-Video (T2V) generation. Existing text-to-video alignment metrics like CLIPScore only generate coarse-grained scores without fine-grained alignment details, failing to align with human preference. To address this limitation, we propose ETVA, a novel Evaluation method of Text-to-Video Alignment via fine-grained question generation and answering. First, a multi-agent system parses prompts into semantic scene graphs to generate atomic questions. Then we design a knowledge-augmented multi-stage reasoning framework for question answering, in which an auxiliary LLM first retrieves relevant common-sense knowledge (e.g., physical laws) and a video LLM then answers the generated questions through a multi-stage reasoning mechanism. Extensive experiments demonstrate that ETVA achieves a Spearman's correlation coefficient of 58.47, a much higher correlation with human judgment than existing metrics, which attain only 31.0. We also construct a comprehensive benchmark specifically designed for text-to-video alignment evaluation, featuring 2k diverse prompts and 12k atomic questions spanning 10 categories. Through a systematic evaluation of 15 existing text-to-video models, we identify their key capabilities and limitations, paving the way for next-generation T2V generation.
Submitted 21 March, 2025; originally announced March 2025.
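The headline number above is a rank correlation between metric scores and human judgments (58.47 is presumably the coefficient scaled by 100). A minimal sketch of computing such agreement, with made-up scores:

```python
import numpy as np
from scipy.stats import spearmanr

# Hypothetical alignment scores from an automatic metric and from human
# raters, one pair per generated video.
metric_scores = np.array([0.81, 0.42, 0.65, 0.93, 0.30, 0.58])
human_scores = np.array([4.5, 2.0, 3.5, 5.0, 1.5, 3.0])

rho, pval = spearmanr(metric_scores, human_scores)
print(f"Spearman's rho = {rho:.3f} (p = {pval:.3f})")
```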
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">14 pages, 16 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.16755">arXiv:2503.16755</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.16755">pdf</a>, <a href="https://arxiv.org/format/2503.16755">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Data Structures and Algorithms">cs.DS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Fast online node labeling with graph subsampling </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Huang%2C+Y">Yushen Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Luo%2C+E">Ertai Luo</a>, <a href="/search/cs?searchtype=author&amp;query=Babenezhad%2C+R">Reza Babenezhad</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+Y">Yifan Sun</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.16755v1-abstract-short" style="display: inline;"> Large data applications rely on storing data in massive, sparse graphs with millions to trillions of nodes. Graph-based methods, such as node prediction, aim for computational efficiency regardless of graph size. Techniques like localized approximate personalized page rank (APPR) solve sparse linear systems with complexity independent of graph size, but is in terms of the maximum node degree, whic&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.16755v1-abstract-full').style.display = 'inline'; document.getElementById('2503.16755v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.16755v1-abstract-full" style="display: none;"> Large data applications rely on storing data in massive, sparse graphs with millions to trillions of nodes. Graph-based methods, such as node prediction, aim for computational efficiency regardless of graph size. Techniques like localized approximate personalized page rank (APPR) solve sparse linear systems with complexity independent of graph size, but is in terms of the maximum node degree, which can be much larger in practice than the average node degree for real-world large graphs. In this paper, we consider an \emph{online subsampled APPR method}, where messages are intentionally dropped at random. We use tools from graph sparsifiers and matrix linear algebra to give approximation bounds on the graph&#39;s spectral properties ($O(1/蔚^2)$ edges), and node classification performance (added $O(n蔚)$ overhead). <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.16755v1-abstract-full').style.display = 'none'; document.getElementById('2503.16755v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. 
arXiv:2503.16755 [pdf, other] (cs.DS, cs.LG)
Title: Fast online node labeling with graph subsampling
Authors: Yushen Huang, Ertai Luo, Reza Babenezhad, Yifan Sun
Abstract: Large data applications rely on storing data in massive, sparse graphs with millions to trillions of nodes. Graph-based methods, such as node prediction, aim for computational efficiency regardless of graph size. Techniques like localized approximate personalized PageRank (APPR) solve sparse linear systems with complexity independent of graph size, but that complexity scales with the maximum node degree, which for real-world large graphs can be much larger than the average node degree. In this paper, we consider an online subsampled APPR method, where messages are intentionally dropped at random. We use tools from graph sparsifiers and matrix linear algebra to give approximation bounds on the graph's spectral properties ($O(1/\varepsilon^2)$ edges) and on node classification performance (an added $O(n\varepsilon)$ overhead).
Submitted 20 March, 2025; originally announced March 2025.
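To make "messages are intentionally dropped at random" concrete, here is a toy push-style PPR routine where each neighbor update survives with probability `keep` and is rescaled to stay unbiased in expectation. This is an illustration of the general idea, not the paper's estimator or its analysis.

```python
import random
from collections import defaultdict

def subsampled_appr(graph, seed, alpha=0.15, eps=1e-3, keep=0.5, max_pushes=10000):
    """Push-based approximate personalized PageRank with random message
    dropping. `graph` maps each node to its neighbor list."""
    p, r = defaultdict(float), defaultdict(float)
    r[seed] = 1.0
    queue, pushes = [seed], 0
    while queue and pushes < max_pushes:       # cap guarantees termination
        pushes += 1
        u = queue.pop()
        if r[u] < eps * len(graph[u]):         # standard push threshold
            continue
        p[u] += alpha * r[u]
        share = (1 - alpha) * r[u] / len(graph[u])
        r[u] = 0.0
        for v in graph[u]:
            if random.random() < keep:         # subsample messages
                r[v] += share / keep           # rescale to stay unbiased
                if v not in queue:
                    queue.append(v)
    return dict(p)

g = {0: [1, 2], 1: [0, 2], 2: [0, 1, 3], 3: [2]}
print(subsampled_appr(g, seed=0))
```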
arXiv:2503.16737 [pdf, other] (stat.ML, cs.LG, math.PR, math.ST)
Title: Optimal Nonlinear Online Learning under Sequential Price Competition via s-Concavity
Authors: Daniele Bracale, Moulinath Banerjee, Cong Shi, Yuekai Sun
Abstract: We consider price competition among multiple sellers over a selling horizon of $T$ periods. In each period, sellers simultaneously offer their prices and subsequently observe their respective demand, which is unobservable to competitors. The demand function for each seller depends on all sellers' prices through a private, unknown, and nonlinear relationship. To address this challenge, we propose a semi-parametric least-squares estimator of the nonlinear mean function, which does not require sellers to communicate demand information. We show that when all sellers employ our policy, their prices converge at a rate of $O(T^{-1/7})$ to the Nash equilibrium prices that sellers would reach if they were fully informed. Each seller incurs a regret of $O(T^{5/7})$ relative to a dynamic benchmark policy. A theoretical contribution of our work is proving the existence of equilibrium under shape-constrained demand functions via the concept of $s$-concavity and establishing regret bounds for our proposed policy. Technically, we also establish new concentration results for the least squares estimator under shape constraints. Our findings offer significant insights into dynamic competition-aware pricing and contribute to the broader study of non-parametric learning in strategic decision-making.
Submitted 20 March, 2025; originally announced March 2025.
arXiv:2503.16550 [pdf, other] (cs.CL)
Title: Unified Enhancement of the Generalization and Robustness of Language Models via Bi-Stage Optimization
Authors: Yudao Sun, Juan Yin, Juan Zhao, Fan Zhang, Yongheng Liu, Hongji Chen
Abstract: Neural network language models (LMs) face significant challenges in generalization and robustness. Currently, many studies focus on improving either generalization or robustness in isolation, without methods that address both aspects simultaneously, which presents a significant challenge for developing LMs that are both robust and generalized. In this paper, we propose a bi-stage optimization framework, termed UEGR, to uniformly enhance both the generalization and robustness of LMs. Specifically, during the forward propagation stage, we enrich the output probability distributions of adversarial samples by adaptive dropout to generate diverse sub-models, and incorporate the JS divergence and adversarial losses of these output distributions to reinforce output stability. During the backward propagation stage, we compute parameter saliency scores and selectively update only the most critical parameters, minimizing unnecessary deviations and consolidating the model's resilience. Theoretical analysis shows that our framework includes gradient regularization, which limits the model's sensitivity to input perturbations, and selective parameter updates, which flatten the loss landscape, thus improving both generalization and robustness. Experimental results show that our method significantly improves the generalization and robustness of LMs compared to existing methods across 13 publicly available language datasets, achieving state-of-the-art (SOTA) performance.
Submitted 19 March, 2025; originally announced March 2025.
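The JS-divergence consistency term mentioned above is easy to state concretely. A self-contained implementation of Jensen-Shannon divergence between two output distributions (the sample inputs are hypothetical):

```python
import numpy as np

def js_divergence(p, q, eps=1e-12):
    """Jensen-Shannon divergence between two probability distributions,
    the kind of consistency term UEGR-style training can regularize."""
    p, q = np.asarray(p, float) + eps, np.asarray(q, float) + eps
    p, q = p / p.sum(), q / q.sum()
    m = 0.5 * (p + q)
    kl = lambda a, b: np.sum(a * np.log(a / b))
    return 0.5 * kl(p, m) + 0.5 * kl(q, m)

# Two sub-model predictions for the same adversarial input (made up):
print(js_divergence([0.7, 0.2, 0.1], [0.5, 0.3, 0.2]))
```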
arXiv:2503.16544 [pdf, other] (cs.CL, cs.AI, cs.HC)
Title: Causal Discovery and Counterfactual Reasoning to Optimize Persuasive Dialogue Policies
Authors: Donghuo Zeng, Roberto Legaspi, Yuewen Sun, Xinshuai Dong, Kazushi Ikeda, Peter Spirtes, Kun Zhang
Abstract: Tailoring persuasive conversations to users leads to more effective persuasion. However, existing dialogue systems often struggle to adapt to dynamically evolving user states. This paper presents a novel method that leverages causal discovery and counterfactual reasoning to optimize a system's persuasion capability and outcomes. We employ the Greedy Relaxation of the Sparsest Permutation (GRaSP) algorithm to identify causal relationships between user and system utterance strategies, treating user strategies as states and system strategies as actions. GRaSP identifies user strategies as causal factors influencing system responses, which inform Bidirectional Conditional Generative Adversarial Networks (BiCoGAN) in generating counterfactual utterances for the system. Subsequently, we use the Dueling Double Deep Q-Network (D3QN) model to exploit these counterfactual data and determine the best policy for selecting system utterances. Our experiments with the PersuasionForGood dataset show measurable improvements in persuasion outcomes over baseline methods. The observed increase in cumulative rewards and Q-values highlights the effectiveness of causal discovery in enhancing counterfactual reasoning and optimizing reinforcement learning policies for online dialogue systems.
Submitted 19 March, 2025; originally announced March 2025.
Comments: 21 pages, 8 figures
arXiv:2503.16402 [pdf, other] (cs.AI, cs.CL, cs.LG)
Title: The Emperor's New Clothes in Benchmarking? A Rigorous Examination of Mitigation Strategies for LLM Benchmark Data Contamination
Authors: Yifan Sun, Han Wang, Dongbai Li, Gang Wang, Huan Zhang
Abstract: Benchmark Data Contamination (BDC), the inclusion of benchmark testing samples in the training set, has raised increasing concerns in Large Language Model (LLM) evaluation, leading to falsely inflated performance estimates and undermining evaluation reliability. To address this, researchers have proposed various mitigation strategies to update existing benchmarks, including modifying original questions or generating new ones based on them. However, a rigorous examination of the effectiveness of these mitigation strategies remains lacking. In this paper, we design a systematic and controlled pipeline, along with two novel metrics, fidelity and contamination resistance, to provide a fine-grained and comprehensive assessment of existing BDC mitigation strategies. Previous assessment methods, such as accuracy drop and accuracy matching, focus solely on aggregate accuracy, often leading to incomplete or misleading conclusions. Our metrics address this limitation by emphasizing question-level evaluation result matching. Extensive experiments with 10 LLMs, 5 benchmarks, 20 BDC mitigation strategies, and 2 contamination scenarios reveal that no existing strategy significantly improves resistance over the vanilla case (i.e., no benchmark update) across all benchmarks, and that none effectively balances fidelity and contamination resistance. These findings underscore the urgent need for more effective BDC mitigation strategies. Our code repository is available at https://github.com/ASTRAL-Group/BDC_mitigation_assessment.
Submitted 20 March, 2025; originally announced March 2025.
Comments: 23 pages
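"Question-level evaluation result matching" suggests comparing per-question outcomes rather than aggregate accuracy. The sketch below is only one plausible reading of that idea; the paper's fidelity and contamination-resistance metrics are defined more carefully.

```python
def match_rate(results_a, results_b):
    """Fraction of questions on which two evaluation runs agree, where each
    run is a list of per-question correctness booleans. Illustrative only;
    not the paper's exact metric definition."""
    assert len(results_a) == len(results_b)
    matches = sum(a == b for a, b in zip(results_a, results_b))
    return matches / len(results_a)

clean = [True, False, True, True, False]   # per-question correctness, clean model
contaminated = [True, True, True, True, False]
print(match_rate(clean, contaminated))     # -> 0.8
```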
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">23 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.16081">arXiv:2503.16081</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.16081">pdf</a>, <a href="https://arxiv.org/format/2503.16081">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> </div> </div> <p class="title is-5 mathjax"> OThink-MR1: Stimulating multimodal generalized reasoning capabilities via dynamic reinforcement learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Z">Zhiyuan Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Y">Yuting Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+F">Feng Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+C">Changwang Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+Y">Ying Sun</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+J">Jun Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.16081v2-abstract-short" style="display: inline;"> Multimodal Large Language Models (MLLMs) have gained significant traction for their ability to process diverse input data types and generate coherent, contextually relevant outputs across various applications. While supervised fine-tuning (SFT) has been the predominant approach to enhance MLLM capabilities in task-specific optimization, it often falls short in fostering crucial generalized reasoni&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.16081v2-abstract-full').style.display = 'inline'; document.getElementById('2503.16081v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.16081v2-abstract-full" style="display: none;"> Multimodal Large Language Models (MLLMs) have gained significant traction for their ability to process diverse input data types and generate coherent, contextually relevant outputs across various applications. While supervised fine-tuning (SFT) has been the predominant approach to enhance MLLM capabilities in task-specific optimization, it often falls short in fostering crucial generalized reasoning abilities. Although reinforcement learning (RL) holds great promise in overcoming these limitations, it encounters two significant challenges: (1) its generalized capacities in multimodal tasks remain largely unexplored, and (2) its training constraints, including the constant Kullback-Leibler divergence or the clamp strategy, often result in suboptimal bottlenecks. To address these challenges, we propose OThink-MR1, an advanced MLLM equipped with profound comprehension and reasoning capabilities across multimodal tasks. Specifically, we introduce Group Relative Policy Optimization with a dynamic Kullback-Leibler strategy (GRPO-D), which markedly enhances reinforcement learning (RL) performance. 
arXiv:2503.15426 [pdf, other] (cs.CV, cs.AI)
Title: Visual Position Prompt for MLLM based Visual Grounding
Authors: Wei Tang, Yanpeng Sun, Qinying Gu, Zechao Li
Abstract: Although Multimodal Large Language Models (MLLMs) excel at various image-related tasks, they encounter challenges in precisely aligning coordinates with spatial information within images, particularly in position-aware tasks such as visual grounding. This limitation arises from two key factors. First, MLLMs lack explicit spatial references, making it difficult to associate textual descriptions with precise image locations. Second, their feature extraction processes prioritize global context over fine-grained spatial details, leading to weak localization capability. To address this issue, we introduce VPP-LLaVA, an MLLM equipped with a Visual Position Prompt (VPP) to improve its grounding capability. VPP-LLaVA integrates two complementary mechanisms: the global VPP overlays learnable, axis-like embeddings onto the input image to provide structured spatial cues, while the local VPP focuses on fine-grained localization by incorporating position-aware queries that suggest probable object locations. We also introduce VPP-SFT, a dataset of 0.6M samples that consolidates high-quality visual grounding data into a compact format for efficient model training. Training on this dataset with VPP enhances the model's performance, achieving state-of-the-art results on standard grounding benchmarks despite using far fewer training samples than other MLLMs like MiniGPT-v2, which rely on much larger datasets ($\sim$21M samples). The code and the VPP-SFT dataset will be available at https://github.com/WayneTomas/VPP-LLaVA upon acceptance.
Submitted 24 March, 2025; v1 submitted 19 March, 2025; originally announced March 2025.
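The "axis-like embeddings" of the global VPP can be pictured as learnable row and column vectors broadcast over the patch grid, so every patch carries an explicit (row, column) reference. A minimal sketch with random arrays standing in for learned parameters; the shapes and names are assumptions:

```python
import numpy as np

rng = np.random.default_rng(0)
H = W = 24   # patch grid of a hypothetical vision encoder
D = 64       # feature dimension

patch_feats = rng.normal(size=(H, W, D))
row_embed = rng.normal(scale=0.02, size=(H, 1, D))   # learnable in practice
col_embed = rng.normal(scale=0.02, size=(1, W, D))   # learnable in practice

# Overlay axis-like position embeddings on the grid via broadcasting.
positioned = patch_feats + row_embed + col_embed
print(positioned.shape)  # (24, 24, 64)
```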
arXiv:2503.15197 [pdf, other] (cs.CV)
Title: Detect-and-Guide: Self-regulation of Diffusion Models for Safe Text-to-Image Generation via Guideline Token Optimization
Authors: Feifei Li, Mi Zhang, Yiming Sun, Min Yang
Abstract: Text-to-image diffusion models have achieved state-of-the-art results in synthesis tasks; however, there is growing concern about their potential misuse in creating harmful content. To mitigate these risks, post-hoc model intervention techniques, such as concept unlearning and safety guidance, have been developed. However, fine-tuning model weights or adapting the hidden states of the diffusion model operates in an uninterpretable way, making it unclear which part of the intermediate variables is responsible for unsafe generation. These interventions also severely affect the sampling trajectory when erasing harmful concepts from complex, multi-concept prompts, hindering their practical use in real-world settings. In this work, we propose Detect-and-Guide (DAG), a safe generation framework that leverages the internal knowledge of diffusion models to perform self-diagnosis and fine-grained self-regulation during the sampling process. DAG first detects harmful concepts from noisy latents using refined cross-attention maps of optimized tokens, then applies safety guidance with adaptive strength and editing regions to negate unsafe generation. The optimization requires only a small annotated dataset and provides precise detection maps with generalizability and concept specificity. Moreover, DAG does not require fine-tuning of diffusion models, and therefore introduces no loss to their generation diversity. Experiments on erasing sexual content show that DAG achieves state-of-the-art safe generation performance, balancing harmfulness mitigation and text-following performance on multi-concept real-world prompts.
Submitted 19 March, 2025; originally announced March 2025.
Comments: CVPR25
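The detect-then-guide pattern can be sketched as a guidance step whose strength away from a harmful concept scales with a detection score. This is a generic classifier-free-guidance-style illustration; DAG's actual update also restricts editing regions via cross-attention maps.

```python
import numpy as np

def guided_noise(eps_uncond, eps_text, eps_harm, detect_score, s=7.5, w_max=5.0):
    """One denoising step's noise estimate: guide toward the prompt and,
    with adaptive strength, away from a detected harmful concept."""
    w = w_max * float(np.clip(detect_score, 0.0, 1.0))  # adaptive strength
    return (eps_uncond
            + s * (eps_text - eps_uncond)       # prompt guidance
            - w * (eps_harm - eps_uncond))      # negative (safety) guidance

shape = (4, 32, 32)  # hypothetical latent shape
e_u, e_t, e_h = (np.random.randn(*shape) for _ in range(3))
print(guided_noise(e_u, e_t, e_h, detect_score=0.3).shape)
```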
arXiv:2503.14919 [cs.CV] https://arxiv.org/abs/2503.14919
GenM$^3$: Generative Pretrained Multi-path Motion Model for Text Conditional Human Motion Generation
Authors: Junyu Shi, Lijiang Liu, Yong Sun, Zhiyuan Zhang, Jinni Zhou, Qiang Nie
Abstract: Scaling up motion datasets is crucial to enhancing motion generation capabilities. However, training on large-scale multi-source datasets introduces data heterogeneity challenges due to variations in motion content. To address this, we propose the Generative Pretrained Multi-path Motion Model (GenM$^3$), a comprehensive framework designed to learn unified motion representations. GenM$^3$ comprises two components: 1) a Multi-Expert VQ-VAE (MEVQ-VAE) that adapts to different dataset distributions to learn a unified discrete motion representation, and 2) a Multi-path Motion Transformer (MMT) that improves intra-modal representations by using separate modality-specific pathways, each with densely activated experts to accommodate variations within that modality, and improves inter-modal alignment through a text-motion shared pathway. To enable large-scale training, we integrate and unify 11 high-quality motion datasets (approximately 220 hours of motion data) and augment them with textual annotations (nearly 10,000 motion sequences labeled by a large language model and 300+ by human experts). After training on our integrated dataset, GenM$^3$ achieves a state-of-the-art FID of 0.035 on the HumanML3D benchmark, surpassing state-of-the-art methods by a large margin. It also demonstrates strong zero-shot generalization on the IDEA400 dataset, highlighting its effectiveness and adaptability across diverse motion scenarios.
Submitted 19 March, 2025; originally announced March 2025.
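In caricature, the multi-expert quantization idea reduces to routing each sample to a dataset-specific codebook. Below is a generic vector-quantization lookup under that assumption; the codebook sizes and routing are illustrative, not the MEVQ-VAE internals.

```python
# Generic VQ lookup with per-dataset codebooks, a caricature of the
# multi-expert idea; all sizes here are illustrative assumptions.
import torch

def vq(z, codebook):                   # z: (N, d), codebook: (K, d)
    idx = torch.cdist(z, codebook).argmin(dim=1)   # nearest code per vector
    return codebook[idx], idx

codebooks = {name: torch.randn(512, 256) for name in ["humanml3d", "idea400"]}
z = torch.randn(8, 256)                # latent motion frames from one dataset
quantized, codes = vq(z, codebooks["humanml3d"])
print(quantized.shape, codes.shape)    # torch.Size([8, 256]) torch.Size([8])
```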
arXiv:2503.14824 [cs.CV] https://arxiv.org/abs/2503.14824
Prototype Perturbation for Relaxing Alignment Constraints in Backward-Compatible Learning
Authors: Zikun Zhou, Yushuai Sun, Wenjie Pei, Xin Li, Yaowei Wang
Abstract: The traditional paradigm for updating retrieval models requires re-computing the embeddings of the gallery data, a time-consuming and computationally intensive process known as backfilling. To circumvent backfilling, Backward-Compatible Learning (BCL) has been widely explored, which aims to train a new model compatible with the old one. Many previous works focus on effectively aligning the embeddings of the new model with those of the old one to enhance backward-compatibility. Nevertheless, such strong alignment constraints would compromise the discriminative ability of the new model, particularly when different classes are closely clustered and hard to distinguish in the old feature space. To address this issue, we propose to relax the constraints by introducing perturbations to the old feature prototypes. This allows us to align the new feature space with a pseudo-old feature space defined by these perturbed prototypes, thereby preserving the discriminative ability of the new model in backward-compatible learning. We develop two approaches for calculating the perturbations: Neighbor-Driven Prototype Perturbation (NDPP) and Optimization-Driven Prototype Perturbation (ODPP). In particular, they take into account the feature distributions of both the old and the new models to obtain proper perturbations as the new model is updated. Extensive experiments on landmark and commodity datasets demonstrate that our approaches perform favorably against state-of-the-art BCL algorithms.
Submitted 18 March, 2025; originally announced March 2025.
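A plausible toy version of neighbor-driven perturbation: nudge each old class prototype away from its nearest neighbor, so alignment targets a less crowded "pseudo-old" space. The step size and direction rule below are my assumptions, not the paper's NDPP formulation.

```python
# Hedged sketch: perturb old prototypes away from their nearest neighbors.
import torch

def perturb_prototypes(protos, step=0.1):      # protos: (C, d)
    d = torch.cdist(protos, protos)            # pairwise prototype distances
    d.fill_diagonal_(float("inf"))             # ignore self-distance
    nn_idx = d.argmin(dim=1)                   # nearest neighbor per class
    away = protos - protos[nn_idx]             # direction away from neighbor
    away = away / (away.norm(dim=1, keepdim=True) + 1e-8)
    return protos + step * away

print(perturb_prototypes(torch.randn(10, 128)).shape)  # torch.Size([10, 128])
```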
arXiv:2503.14355 [cs.CV] https://arxiv.org/abs/2503.14355
MAST-Pro: Dynamic Mixture-of-Experts for Adaptive Segmentation of Pan-Tumors with Knowledge-Driven Prompts
Authors: Runqi Meng, Sifan Song, Pengfei Jin, Yujin Oh, Lin Teng, Yulin Wang, Yiqun Sun, Ling Chen, Xiang Li, Quanzheng Li, Ning Guo, Dinggang Shen
Abstract: Accurate tumor segmentation is crucial for cancer diagnosis and treatment. While foundation models have advanced general-purpose segmentation, existing methods still struggle with: (1) limited incorporation of medical priors, (2) imbalance between generic and tumor-specific features, and (3) high computational costs for clinical adaptation. To address these challenges, we propose MAST-Pro (Mixture-of-experts for Adaptive Segmentation of pan-Tumors with knowledge-driven Prompts), a novel framework that integrates a dynamic Mixture-of-Experts (D-MoE) and knowledge-driven prompts for pan-tumor segmentation. Specifically, text and anatomical prompts provide domain-specific priors that guide tumor representation learning, while D-MoE dynamically selects experts to balance generic and tumor-specific feature learning, improving segmentation accuracy across diverse tumor types. To enhance efficiency, we employ Parameter-Efficient Fine-Tuning (PEFT), optimizing MAST-Pro with significantly reduced computational overhead. Experiments on multi-anatomical tumor datasets demonstrate that MAST-Pro outperforms state-of-the-art approaches, achieving up to a 5.20% improvement in average DSC while reducing trainable parameters by 91.04%, without compromising accuracy.
Submitted 18 March, 2025; originally announced March 2025.
Comments: 10 pages, 2 figures
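The core D-MoE mechanic, as I read it, is a prompt-conditioned gate that softly blends experts. A minimal sketch, assuming a gate driven by a prompt embedding (dimensions and module names are mine):

```python
# Toy dynamic mixture-of-experts gate: a prompt embedding selects a soft
# blend of experts. A simplification, not the MAST-Pro implementation.
import torch
import torch.nn as nn

class DMoE(nn.Module):
    def __init__(self, dim=64, n_experts=4):
        super().__init__()
        self.experts = nn.ModuleList(nn.Linear(dim, dim) for _ in range(n_experts))
        self.gate = nn.Linear(dim, n_experts)

    def forward(self, x, prompt):              # x, prompt: (B, dim)
        w = self.gate(prompt).softmax(dim=-1)                     # (B, E)
        outs = torch.stack([e(x) for e in self.experts], dim=1)   # (B, E, dim)
        return (w.unsqueeze(-1) * outs).sum(dim=1)                # (B, dim)

x = torch.randn(2, 64)
print(DMoE()(x, x).shape)                      # torch.Size([2, 64])
```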
arXiv:2503.13139 [cs.CV, cs.AI, cs.CL, eess.IV] https://arxiv.org/abs/2503.13139
Logic-in-Frames: Dynamic Keyframe Search via Visual Semantic-Logical Verification for Long Video Understanding
Authors: Weiyu Guo, Ziyang Chen, Shaoguang Wang, Jianxiang He, Yijie Xu, Jinhui Ye, Ying Sun, Hui Xiong
Abstract: Understanding long video content is a complex endeavor that often relies on densely sampled frame captions or end-to-end feature selectors, yet these techniques commonly overlook the logical relationships between textual queries and visual elements. In practice, computational constraints necessitate coarse frame subsampling, a challenge analogous to "finding a needle in a haystack." To address this issue, we introduce a semantics-driven search framework that reformulates keyframe selection under the paradigm of Visual Semantic-Logical Search. Specifically, we systematically define four fundamental logical dependencies: 1) spatial co-occurrence, 2) temporal proximity, 3) attribute dependency, and 4) causal order. These relations dynamically update frame sampling distributions through an iterative refinement process, enabling context-aware identification of semantically critical frames tailored to specific query requirements. Our method establishes new state-of-the-art performance on the manually annotated benchmark in keyframe selection metrics. Furthermore, when applied to downstream video question-answering tasks, the proposed approach demonstrates the best performance gains over existing methods on LongVideoBench and Video-MME, validating its effectiveness in bridging the logical gap between textual queries and visual-temporal reasoning. The code will be publicly available.
Submitted 17 March, 2025; originally announced March 2025.
Comments: 18 pages, under review
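The "iteratively refined sampling distribution" step can be illustrated with a stand-in relevance score per frame (the four logical dependencies would supply real scores); everything below is a toy sketch, not the paper's search procedure.

```python
# Query-aware keyframe sampling sketch: reweight a frame distribution by
# relevance scores, then draw candidate keyframes. Scores are stand-ins.
import numpy as np

rng = np.random.default_rng(0)
scores = rng.random(300)                   # stand-in relevance per frame
p = np.full(300, 1 / 300)                  # start from uniform sampling
for _ in range(3):                         # iterative refinement rounds
    p = p * np.exp(scores)                 # boost relevant frames
    p /= p.sum()
keyframes = rng.choice(300, size=8, replace=False, p=p)
print(sorted(keyframes))
```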
arXiv:2503.12978 [cs.LG] https://arxiv.org/abs/2503.12978
Enhancing Job Salary Prediction with Disentangled Composition Effect Modeling: A Neural Prototyping Approach
Authors: Yang Ji, Ying Sun, Hengshu Zhu
Abstract: In the era of the knowledge economy, understanding how job skills influence salary is crucial for promoting recruitment with competitive salary systems and aligned salary expectations. Despite efforts on salary prediction based on job positions and talent demographics, methods that can effectively discern the intricate composition effect of set-structured skills on job salary are still lacking. While recent advances in neural networks have significantly improved accurate set-based quantitative modeling, their lack of explainability hinders obtaining insights into the skills' composition effects. Indeed, model explanation for set data is challenging due to its combinatorial nature, rich semantics, and unique format. To this end, we propose LGDESetNet, a novel intrinsically explainable set-based neural prototyping approach for salary prediction that can reveal disentangled skill sets that impact salary from both local and global perspectives. Specifically, we propose a skill-graph-enhanced disentangled discrete subset selection layer to identify multi-faceted influential input subsets with varied semantics. Furthermore, we propose a set-oriented prototype learning method to extract globally influential prototypical sets. The resulting output is transparently derived from the semantic interplay between these input subsets and global prototypes. Extensive experiments on four real-world datasets demonstrate that our method outperforms state-of-the-art baselines in salary prediction while providing explainable insights into salary-influencing patterns.
Submitted 21 March, 2025; v1 submitted 17 March, 2025; originally announced March 2025.
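One way to picture prototype-based set prediction: embed a skill set, compare it with learned prototypical sets, and read the prediction off as a similarity-weighted blend of prototype values. The sketch below is a loose stand-in (random embeddings, made-up salaries), not LGDESetNet.

```python
# Stand-in for prototype-based set scoring: mean-pool a skill set and
# predict salary as a similarity-weighted sum over global prototypes.
import numpy as np

rng = np.random.default_rng(1)
emb = {s: rng.normal(size=16) for s in ["python", "sql", "ml", "excel"]}
protos = rng.normal(size=(3, 16))          # global prototypical sets
proto_salary = np.array([80.0, 120.0, 150.0])  # value tied to each prototype

x = np.mean([emb[s] for s in ["python", "ml"]], axis=0)  # set embedding
w = np.exp(protos @ x); w /= w.sum()       # softmax similarity weights
print(round(float(w @ proto_salary), 1))   # transparent weighted prediction
```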
arXiv:2503.12782 [cs.RO] https://arxiv.org/abs/2503.12782
DART: Dual-level Autonomous Robotic Topology for Efficient Exploration in Unknown Environments
Authors: Qiming Wang, Yulong Gao, Yang Wang, Xiongwei Zhao, Yijiao Sun, Xiangyan Kong
Abstract: Conventional algorithms in autonomous exploration face challenges due to their inability to accurately and efficiently identify the spatial distribution of convex regions in the real-time map. These methods often prioritize navigation toward the nearest or most information-rich frontiers (the boundaries between known and unknown areas), resulting in incomplete convex-region exploration and requiring excessive backtracking to revisit the missed areas. To address these limitations, this paper introduces an innovative dual-level topological analysis approach. First, we introduce a Low-level Topological Graph (LTG), generated through uniform sampling of the original map data, which captures essential geometric and connectivity details. Next, the LTG is transformed into a High-level Topological Graph (HTG), representing the spatial layout and exploration completeness of convex regions, prioritizing the exploration of convex regions that are not fully explored and minimizing unnecessary backtracking. Finally, a novel Local Artificial Potential Field (LAPF) method is employed for motion control, replacing conventional path planning and boosting overall efficiency. Experimental results highlight the effectiveness of our approach. Simulation tests reveal that our framework significantly reduces exploration time and travel distance, outperforming existing methods in both speed and efficiency. Ablation studies confirm the critical role of each framework component. Real-world tests demonstrate the robustness of our method in environments with poor mapping quality, surpassing other approaches in adaptability to mapping inaccuracies and inaccessible areas.
Submitted 16 March, 2025; originally announced March 2025.
Comments: 11 pages, 9 figures, Journal
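The LAPF component builds on the textbook artificial-potential-field idea: an attractive force toward the goal plus repulsive forces near obstacles. Here is the classic formulation as a runnable sketch (this is the standard APF update, not DART's exact local variant):

```python
# Classic attractive/repulsive potential-field step; gains and step size
# are arbitrary demo values, not the paper's tuning.
import numpy as np

def apf_step(pos, goal, obstacles, k_att=1.0, k_rep=0.5, d0=2.0):
    force = k_att * (goal - pos)                        # attraction to goal
    for ob in obstacles:
        diff = pos - ob
        d = np.linalg.norm(diff)
        if d < d0:                                      # repel only when close
            force += k_rep * (1 / d - 1 / d0) / d**3 * diff
    return pos + 0.1 * force / (np.linalg.norm(force) + 1e-8)

p = np.array([0.0, 0.0])
for _ in range(5):                                      # five motion steps
    p = apf_step(p, np.array([5.0, 5.0]), [np.array([2.0, 2.0])])
print(np.round(p, 2))
```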
arXiv:2503.12600 [cs.LG] https://arxiv.org/abs/2503.12600
GraphEval: A Lightweight Graph-Based LLM Framework for Idea Evaluation
Authors: Tao Feng, Yihang Sun, Jiaxuan You
Abstract: The powerful capabilities of Large Language Models (LLMs) have led to their growing use in evaluating human-generated content, particularly in evaluating research ideas within academic settings. Existing solutions primarily rely on prompt-based LLM methods or fine-tuned lightweight language models for idea evaluation. However, these methods are often unstable and struggle to comprehend the complex semantic information embedded in the ideas, impeding their ability to perform high-quality evaluations. To address these challenges, we propose GraphEval, a lightweight graph-based LLM framework for idea evaluation. Our insight is that a complex idea can be broken down into comprehensible viewpoint nodes using prompts from small LLMs. These viewpoint nodes can then be linked through edges created from LLM-based relation extraction and/or BERT similarity scores. The resulting viewpoint graph can be used to conveniently propagate scores across viewpoint nodes to improve the robustness of the idea evaluations. In particular, we propose two lightweight graph-based methods for idea evaluation: (1) GraphEval-LP, a training-free label propagation algorithm that propagates evaluation scores from known viewpoint nodes to unknown nodes; and (2) GraphEval-GNN, a Graph Neural Network (GNN) trained to predict evaluation scores given the observed graph with minimal computational resources. Moreover, to overcome the limitations of LLMs in objectively assessing the novelty of ideas, we further add a novelty detection model to GraphEval-GNN to enhance its capability to judge idea novelty. Experiments on two datasets show that GraphEval improves F1 scores by at least 14% with low computation and API costs. Additionally, GraphEval can effectively detect plagiarized ideas.
Submitted 16 March, 2025; originally announced March 2025.
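Label propagation over a viewpoint graph is a standard, fully specifiable algorithm, so it is easy to illustrate: clamp the known scores and repeatedly average over neighbors. This is generic label propagation in the spirit of GraphEval-LP, not the authors' code.

```python
# Tiny label-propagation sketch: scores on labeled viewpoint nodes spread
# over a row-normalized adjacency matrix until they stabilize.
import numpy as np

A = np.array([[0, 1, 1, 0],
              [1, 0, 1, 0],
              [1, 1, 0, 1],
              [0, 0, 1, 0]], dtype=float)
P = A / A.sum(axis=1, keepdims=True)       # transition matrix
y = np.array([8.0, 0.0, 0.0, 3.0])         # known scores on nodes 0 and 3
known = np.array([True, False, False, True])

f = y.copy()
for _ in range(50):
    f = P @ f                              # propagate one hop
    f[known] = y[known]                    # clamp the labeled nodes
print(np.round(f, 2))                      # nodes 1 and 2 inherit scores
```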
arXiv:2503.12547 [cs.IR] https://arxiv.org/abs/2503.12547
LLMSeR: Enhancing Sequential Recommendation via LLM-based Data Augmentation
Authors: Yuqi Sun, Qidong Liu, Haiping Zhu, Feng Tian
Abstract: Sequential Recommender Systems (SRS) have become a cornerstone of online platforms, leveraging users' historical interaction data to forecast their next potential engagement. Despite their widespread adoption, SRS often grapple with the long-tail user dilemma, resulting in less effective recommendations for individuals with limited interaction records. The advent of Large Language Models (LLMs), with their profound capability to discern semantic relationships among items, has opened new avenues for enhancing SRS through data augmentation. Nonetheless, current methodologies encounter obstacles, including the absence of collaborative signals and the prevalence of hallucination phenomena. In this work, we present LLMSeR, an innovative framework that utilizes LLMs to generate pseudo-prior items, thereby improving the efficacy of Sequential Recommender Systems. To alleviate the challenge of insufficient collaborative signals, we introduce the Semantic Interaction Augmentor (SIA), a method that integrates both semantic and collaborative information to comprehensively augment user interaction data. Moreover, to weaken the adverse effects of hallucination in SRS, we develop Adaptive Reliability Validation (ARV), a validation technique designed to assess the reliability of the generated pseudo items. Complementing these advancements, we also devise a Dual-Channel Training strategy, ensuring seamless integration of data augmentation into the SRS training process. Extensive experiments conducted with three widely used SRS models demonstrate the generalizability and efficacy of LLMSeR.
Submitted 21 March, 2025; v1 submitted 16 March, 2025; originally announced March 2025.
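A reliability check on generated pseudo items could plausibly look like the filter below: keep a generated item only if its embedding is close enough to the user's real history. This is one plausible reading of "reliability validation", with random embeddings and an arbitrary threshold, not the paper's ARV.

```python
# Sketch: filter LLM-generated pseudo-prior items by cosine similarity
# to the user's real interaction history. Threshold is a stand-in.
import numpy as np

rng = np.random.default_rng(2)
hist = rng.normal(size=(5, 32))            # embeddings of real interactions
hist /= np.linalg.norm(hist, axis=1, keepdims=True)
cand = rng.normal(size=(8, 32))            # embeddings of generated items
cand /= np.linalg.norm(cand, axis=1, keepdims=True)

sims = cand @ hist.T                       # cosine similarity matrix
keep = sims.max(axis=1) > 0.2              # keep items near real history
print(int(keep.sum()), "of", len(cand), "pseudo items kept")
```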
arXiv:2503.11780 [cs.CV] https://arxiv.org/abs/2503.11780
Rethinking Multi-modal Object Detection from the Perspective of Mono-Modality Feature Learning
Authors: Tianyi Zhao, Boyang Liu, Yanglei Gao, Yiming Sun, Maoxun Yuan, Xingxing Wei
Abstract: Multi-Modal Object Detection (MMOD), due to its stronger adaptability to various complex environments, has been widely applied in a broad range of applications. Extensive research is dedicated to RGB-IR object detection, primarily focusing on how to integrate complementary features from the RGB and IR modalities. However, this line of work neglects the problem of insufficient mono-modality learning: the feature extraction capability of each single modality decreases during multi-modal joint training. This leads to an unreasonable but prevalent phenomenon, Fusion Degradation, which hinders the performance improvement of MMOD models. Motivated by this, in this paper, we introduce linear probing evaluation to multi-modal detectors and rethink the multi-modal object detection task from the mono-modality learning perspective. We then construct a novel framework called M$^2$D-LIF, which consists of the Mono-Modality Distillation (M$^2$D) method and the Local Illumination-aware Fusion (LIF) module. The M$^2$D-LIF framework facilitates sufficient learning of each single modality during multi-modal joint training and explores a lightweight yet effective feature fusion manner to achieve superior object detection performance. Extensive experiments conducted on three MMOD datasets demonstrate that M$^2$D-LIF effectively mitigates the Fusion Degradation phenomenon and outperforms the previous SOTA detectors.
Submitted 14 March, 2025; originally announced March 2025.
Comments: 10 pages, 6 figures
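Mono-modality distillation, read at face value, amounts to pulling each branch of the fusion detector toward features from a detector trained on that modality alone. A minimal loss sketch under that assumption (the loss form and shapes are mine):

```python
# Sketch of a mono-modality distillation loss: each student branch matches
# a frozen single-modality teacher. A guess at the shape of the idea only.
import torch
import torch.nn.functional as F

def m2d_loss(student_rgb, student_ir, teacher_rgb, teacher_ir):
    return (F.mse_loss(student_rgb, teacher_rgb.detach())
            + F.mse_loss(student_ir, teacher_ir.detach()))

feat = lambda: torch.randn(2, 256, 32, 32)   # dummy feature maps
print(float(m2d_loss(feat(), feat(), feat(), feat())))
```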
arXiv:2503.11347 [q-bio.QM, cs.LG, physics.bio-ph] https://arxiv.org/abs/2503.11347
Integrating Dynamical Systems Modeling with Spatiotemporal scRNA-seq Data Analysis
Authors: Zhenyi Zhang, Yuhao Sun, Qiangwei Peng, Tiejun Li, Peijie Zhou
Abstract: Understanding the dynamic nature of biological systems is fundamental to deciphering cellular behavior, developmental processes, and disease progression. Single-cell RNA sequencing (scRNA-seq) has provided static snapshots of gene expression, offering valuable insights into cellular states at a single time point. Recent advancements in temporally resolved scRNA-seq, spatial transcriptomics (ST), and time-series spatial transcriptomics (temporal-ST) have further revolutionized our ability to study the spatiotemporal dynamics of individual cells. These technologies, when combined with computational frameworks such as Markov chains, stochastic differential equations (SDEs), and generative models like optimal transport and Schrödinger bridges, enable the reconstruction of dynamic cellular trajectories and cell fate decisions. This review discusses how these dynamical systems approaches offer new opportunities to model and infer cellular dynamics from a systematic perspective.
Submitted 14 March, 2025; originally announced March 2025.
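The SDE framing mentioned here is concrete enough to demo: Euler-Maruyama simulation of a drift-diffusion process is the forward model such methods fit to expression snapshots. Below is a generic toy simulation (the linear drift and noise level are arbitrary), not any specific reviewed method.

```python
# Euler-Maruyama simulation of a toy drift-diffusion SDE, the kind of
# forward model fit to scRNA-seq snapshots in trajectory inference.
import numpy as np

rng = np.random.default_rng(3)

def simulate(x0, drift, sigma=0.1, dt=0.01, steps=200):
    x = np.array(x0, dtype=float)
    for _ in range(steps):
        x += drift(x) * dt + sigma * np.sqrt(dt) * rng.normal(size=x.shape)
    return x

# Linear relaxation toward the origin as a stand-in drift field.
print(np.round(simulate([1.0, -1.0], lambda x: -x), 3))
```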
arXiv:2503.11224 [cs.LG, cs.AI, cs.CL] https://arxiv.org/abs/2503.11224
Technologies on Effectiveness and Efficiency: A Survey of State Spaces Models
Authors: Xingtai Lv, Youbang Sun, Kaiyan Zhang, Shang Qu, Xuekai Zhu, Yuchen Fan, Yi Wu, Ermo Hua, Xinwei Long, Ning Ding, Bowen Zhou
Abstract: State Space Models (SSMs) have emerged as a promising alternative to the popular transformer-based models and have been gaining increasing attention. Compared to transformers, SSMs excel at tasks with sequential data or longer contexts, demonstrating comparable performance with significant efficiency gains. In this survey, we provide a coherent and systematic overview of SSMs, including their theoretical motivations, mathematical formulations, comparison with existing model classes, and various applications. We divide the SSM series into three main sections, providing a detailed introduction to the original SSM, the structured SSM represented by S4, and the selective SSM typified by Mamba. We put an emphasis on technicality and highlight the various key techniques introduced to address the effectiveness and efficiency of SSMs. We hope this manuscript serves as an introduction for researchers exploring the theoretical foundations of SSMs.
Submitted 14 March, 2025; originally announced March 2025.
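The recurrence at the heart of the surveyed models is the discrete linear state-space update, $h_t = A h_{t-1} + B x_t$, $y_t = C h_t$. A scalar-input toy scan (matrix values are arbitrary demo choices):

```python
# The discrete linear state-space recurrence underlying S4/Mamba-style
# models, run as a plain sequential scan on a toy input.
import numpy as np

def ssm_scan(x, A, B, C):
    h = np.zeros(A.shape[0])           # hidden state
    ys = []
    for xt in x:
        h = A @ h + B * xt             # state update
        ys.append(C @ h)               # readout
    return np.array(ys)

A = np.diag([0.9, 0.5])                # stable diagonal dynamics
B = np.array([1.0, 1.0])
C = np.array([1.0, -1.0])
print(np.round(ssm_scan(np.ones(5), A, B, C), 3))
```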
arXiv:2503.11043 [cs.LG] https://arxiv.org/abs/2503.11043
InverseBench: Benchmarking Plug-and-Play Diffusion Priors for Inverse Problems in Physical Sciences
Authors: Hongkai Zheng, Wenda Chu, Bingliang Zhang, Zihui Wu, Austin Wang, Berthy T. Feng, Caifeng Zou, Yu Sun, Nikola Kovachki, Zachary E. Ross, Katherine L. Bouman, Yisong Yue
Abstract: Plug-and-play diffusion priors (PnPDP) have emerged as a promising research direction for solving inverse problems. However, current studies primarily focus on natural image restoration, leaving the performance of these algorithms in scientific inverse problems largely unexplored. To address this gap, we introduce InverseBench, a framework that evaluates diffusion models across five distinct scientific inverse problems. These problems present unique structural challenges that differ from existing benchmarks, arising from critical scientific applications such as optical tomography, medical imaging, black hole imaging, seismology, and fluid dynamics. With InverseBench, we benchmark 14 inverse problem algorithms that use plug-and-play diffusion priors against strong, domain-specific baselines, offering valuable new insights into the strengths and weaknesses of existing algorithms. To facilitate further research and development, we open-source the codebase, along with datasets and pre-trained models, at https://devzhk.github.io/InverseBench/.
Submitted 13 March, 2025; originally announced March 2025.
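The plug-and-play pattern these benchmarked algorithms share is: alternate a data-fidelity gradient step with a prior step supplied by a denoiser. Below, a soft-threshold shrinkage stands in for the diffusion denoiser so the loop stays self-contained; it is the generic PnP/ISTA skeleton, not any benchmarked algorithm.

```python
# Generic plug-and-play iteration on a linear inverse problem y = A x:
# gradient step on ||Ax - y||^2, then a stand-in "denoiser" (soft threshold).
import numpy as np

rng = np.random.default_rng(4)
A = rng.normal(size=(40, 50)) / np.sqrt(40)   # underdetermined forward model
x_true = np.zeros(50); x_true[:5] = 1.0       # sparse ground truth
y = A @ x_true                                # noiseless measurements

x = np.zeros(50)
for _ in range(200):
    x = x - 0.2 * A.T @ (A @ x - y)           # data-fidelity gradient step
    x = np.sign(x) * np.maximum(np.abs(x) - 0.01, 0)  # prior / denoiser step
print(np.round(x[:6], 2))                     # approximately recovers x_true
```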
