Search | arXiv e-print repository
<a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1–50 of 6,101 results for author: <span class="mathjax">Wang, H</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a> </span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&query=Wang%2C+H">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." 
type="text" value="Wang, H"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Wang%2C+H&terms-0-field=author&size=50&order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Wang, H"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. 
</div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Wang%2C+H&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Wang%2C+H&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Wang%2C+H&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Wang%2C+H&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Wang%2C+H&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Wang%2C+H&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">…</span></li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.21772">arXiv:2503.21772</a> <span> [<a href="https://arxiv.org/pdf/2503.21772">pdf</a>, <a href="https://arxiv.org/format/2503.21772">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> LOCORE: Image Re-ranking with Long-Context Sequence Modeling </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xiao%2C+Z">Zilin Xiao</a>, <a href="/search/cs?searchtype=author&query=Suma%2C+P">Pavel Suma</a>, <a href="/search/cs?searchtype=author&query=Sachdeva%2C+A">Ayush Sachdeva</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+H">Hao-Jen Wang</a>, <a href="/search/cs?searchtype=author&query=Kordopatis-Zilos%2C+G">Giorgos Kordopatis-Zilos</a>, <a href="/search/cs?searchtype=author&query=Tolias%2C+G">Giorgos Tolias</a>, <a href="/search/cs?searchtype=author&query=Ordonez%2C+V">Vicente Ordonez</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.21772v1-abstract-short" style="display: inline;"> We introduce LOCORE, Long-Context Re-ranker, a model that takes as input local descriptors corresponding to an image query and a list of gallery images and outputs similarity scores between the query and each gallery image. 
This model is used for image retrieval, where typically a first ranking is performed with an efficient similarity measure, and then a shortlist of top-ranked images is re-ranke… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.21772v1-abstract-full').style.display = 'inline'; document.getElementById('2503.21772v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.21772v1-abstract-full" style="display: none;"> We introduce LOCORE, Long-Context Re-ranker, a model that takes as input local descriptors corresponding to an image query and a list of gallery images and outputs similarity scores between the query and each gallery image. This model is used for image retrieval, where typically a first ranking is performed with an efficient similarity measure, and then a shortlist of top-ranked images is re-ranked based on a more fine-grained similarity measure. Compared to existing methods that perform pair-wise similarity estimation with local descriptors or list-wise re-ranking with global descriptors, LOCORE is the first method to perform list-wise re-ranking with local descriptors. To achieve this, we leverage efficient long-context sequence models to effectively capture the dependencies between query and gallery images at the local-descriptor level. During testing, we process long shortlists with a sliding window strategy that is tailored to overcome the context size limitations of sequence models. Our approach achieves superior performance compared with other re-rankers on established image retrieval benchmarks of landmarks (ROxf and RPar), products (SOP), fashion items (In-Shop), and bird species (CUB-200) while having comparable latency to the pair-wise local descriptor re-rankers. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.21772v1-abstract-full').style.display = 'none'; document.getElementById('2503.21772v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. 
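The sliding-window strategy mentioned in the LOCORE abstract above can be pictured with a short sketch. This is a minimal illustration, not the paper's code: the model interface, the window and stride values, and the max-over-windows merging rule are all assumptions.

# Hypothetical sketch of list-wise re-ranking with a sliding window, in the
# spirit of entry 1. `model` stands in for any list-wise re-ranker that scores
# a query against a window of gallery items in one pass.
from typing import Callable, List, Sequence, Tuple

def sliding_window_rerank(
    model: Callable[[object, Sequence[object]], List[float]],
    query: object,
    shortlist: List[object],   # top-k items from the first-stage ranker
    window: int = 100,         # context budget of the sequence model (assumed)
    stride: int = 50,          # overlap so every item is scored in context
) -> List[Tuple[int, float]]:
    """Score every shortlist item with the list-wise model, then sort."""
    best = {}
    for start in range(0, len(shortlist), stride):
        chunk = shortlist[start:start + window]
        scores = model(query, chunk)  # one forward pass per window
        for offset, s in enumerate(scores):
            idx = start + offset
            best[idx] = max(best.get(idx, float("-inf")), s)  # keep best view
        if start + window >= len(shortlist):
            break
    return sorted(best.items(), key=lambda kv: kv[1], reverse=True)

Overlapping windows ensure each shortlist item is scored together with enough neighbors to benefit from list-wise context.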
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">CVPR 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.21683">arXiv:2503.21683</a> <span> [<a href="https://arxiv.org/pdf/2503.21683">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> LLM-Gomoku: A Large Language Model-Based System for Strategic Gomoku with Self-Play and Reinforcement Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+H">Hui Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.21683v1-abstract-short" style="display: inline;"> In recent years, large language models (LLMs) have shown significant advancements in natural language processing (NLP), with strong capa-bilities in generation, comprehension, and rea-soning. These models have found applications in education, intelligent decision-making, and gaming. However, effectively utilizing LLMs for strategic planning and decision-making in the game of Gomoku remains a chall… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.21683v1-abstract-full').style.display = 'inline'; document.getElementById('2503.21683v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.21683v1-abstract-full" style="display: none;"> In recent years, large language models (LLMs) have shown significant advancements in natural language processing (NLP), with strong capa-bilities in generation, comprehension, and rea-soning. These models have found applications in education, intelligent decision-making, and gaming. However, effectively utilizing LLMs for strategic planning and decision-making in the game of Gomoku remains a challenge. This study aims to develop a Gomoku AI system based on LLMs, simulating the human learning process of playing chess. The system is de-signed to understand and apply Gomoku strat-egies and logic to make rational decisions. The research methods include enabling the model to "read the board," "understand the rules," "select strategies," and "evaluate positions," while en-hancing its abilities through self-play and rein-forcement learning. The results demonstrate that this approach significantly improves the se-lection of move positions, resolves the issue of generating illegal positions, and reduces pro-cess time through parallel position evaluation. After extensive self-play training, the model's Gomoku-playing capabilities have been notably enhanced. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.21683v1-abstract-full').style.display = 'none'; document.getElementById('2503.21683v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.21620">arXiv:2503.21620</a> <span> [<a href="https://arxiv.org/pdf/2503.21620">pdf</a>, <a href="https://arxiv.org/format/2503.21620">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> UI-R1: Enhancing Action Prediction of GUI Agents by Reinforcement Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lu%2C+Z">Zhengxi Lu</a>, <a href="/search/cs?searchtype=author&query=Chai%2C+Y">Yuxiang Chai</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+Y">Yaxuan Guo</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+X">Xi Yin</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+L">Liang Liu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+H">Hao Wang</a>, <a href="/search/cs?searchtype=author&query=Xiong%2C+G">Guanjing Xiong</a>, <a href="/search/cs?searchtype=author&query=Li%2C+H">Hongsheng Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.21620v1-abstract-short" style="display: inline;"> The recent DeepSeek-R1 has showcased the emergence of reasoning capabilities in LLMs through reinforcement learning (RL) with rule-based rewards. Building on this idea, we are the first to explore how rule-based RL can enhance the reasoning capabilities of multimodal large language models (MLLMs) for graphic user interface (GUI) action prediction tasks. To this end, we curate a small yet high-qual… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.21620v1-abstract-full').style.display = 'inline'; document.getElementById('2503.21620v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.21620v1-abstract-full" style="display: none;"> The recent DeepSeek-R1 has showcased the emergence of reasoning capabilities in LLMs through reinforcement learning (RL) with rule-based rewards. Building on this idea, we are the first to explore how rule-based RL can enhance the reasoning capabilities of multimodal large language models (MLLMs) for graphic user interface (GUI) action prediction tasks. To this end, we curate a small yet high-quality dataset of 136 challenging tasks, encompassing five common action types on mobile devices. We also introduce a unified rule-based action reward, enabling model optimization via policy-based algorithms such as Group Relative Policy Optimization (GRPO). Experimental results demonstrate that our proposed data-efficient model, UI-R1-3B, achieves substantial improvements on both in-domain (ID) and out-of-domain (OOD) tasks. Specifically, on the ID benchmark AndroidControl, the action type accuracy improves by 15%, while grounding accuracy increases by 10.3%, compared with the base model (i.e. Qwen2.5-VL-3B). On the OOD GUI grounding benchmark ScreenSpot-Pro, our model surpasses the base model by 6.0% and achieves competitive performance with larger models (e.g., OS-Atlas-7B), which are trained via supervised fine-tuning (SFT) on 76K data. 
These results underscore the potential of rule-based reinforcement learning to advance GUI understanding and control, paving the way for future research in this domain. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.21620v1-abstract-full').style.display = 'none'; document.getElementById('2503.21620v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.21585">arXiv:2503.21585</a> <span> [<a href="https://arxiv.org/pdf/2503.21585">pdf</a>, <a href="https://arxiv.org/format/2503.21585">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Probabilistic Functional Neural Networks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+H">Haixu Wang</a>, <a href="/search/cs?searchtype=author&query=Cao%2C+J">Jiguo Cao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.21585v1-abstract-short" style="display: inline;"> High-dimensional functional time series (HDFTS) are often characterized by nonlinear trends and high spatial dimensions. Such data poses unique challenges for modeling and forecasting due to the nonlinearity, nonstationarity, and high dimensionality. We propose a novel probabilistic functional neural network (ProFnet) to address these challenges. ProFnet integrates the strengths of feedforward and… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.21585v1-abstract-full').style.display = 'inline'; document.getElementById('2503.21585v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.21585v1-abstract-full" style="display: none;"> High-dimensional functional time series (HDFTS) are often characterized by nonlinear trends and high spatial dimensions. Such data poses unique challenges for modeling and forecasting due to the nonlinearity, nonstationarity, and high dimensionality. We propose a novel probabilistic functional neural network (ProFnet) to address these challenges. ProFnet integrates the strengths of feedforward and deep neural networks with probabilistic modeling. The model generates probabilistic forecasts using Monte Carlo sampling and also enables the quantification of uncertainty in predictions. While capturing both temporal and spatial dependencies across multiple regions, ProFnet offers a scalable and unified solution for large datasets. Applications to Japan's mortality rates demonstrate superior performance. This approach enhances predictive accuracy and provides interpretable uncertainty estimates, making it a valuable tool for forecasting complex high-dimensional functional data and HDFTS. 
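Entry 3's "unified rule-based action reward" is not spelled out in the abstract; the sketch below guesses a plausible shape (action-type match plus a grounding tolerance for click actions), with all rules, weights, and the tolerance value as assumptions.

# A guessed shape for a rule-based GUI action reward like the one entry 3
# names. The exact rules, the 0.5/0.5 weighting, and the tolerance are
# assumptions, not the paper's specification.
def action_reward(pred: dict, gold: dict, tol: float = 0.14) -> float:
    """Reward = action-type match, plus a grounding bonus for clicks."""
    r = 0.0
    if pred.get("type") == gold.get("type"):
        r += 0.5
        if gold["type"] == "click":
            # grounding: the predicted point must land near the gold point
            # (coordinates assumed normalized to [0, 1])
            dx = pred["x"] - gold["x"]
            dy = pred["y"] - gold["y"]
            if (dx * dx + dy * dy) ** 0.5 <= tol:
                r += 0.5
        else:
            r += 0.5  # non-spatial actions need only the right type
    return r  # in [0, 1]; a group of such rewards can feed GRPO advantages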
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.21585v1-abstract-full').style.display = 'none'; document.getElementById('2503.21585v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.21240">arXiv:2503.21240</a> <span> [<a href="https://arxiv.org/pdf/2503.21240">pdf</a>, <a href="https://arxiv.org/format/2503.21240">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Software Engineering">cs.SE</span> </div> </div> <p class="title is-5 mathjax"> The Promise and Pitfalls of WebAssembly: Perspectives from the Industry </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=He%2C+N">Ningyu He</a>, <a href="/search/cs?searchtype=author&query=Cao%2C+S">Shangtong Cao</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+H">Haoyu Wang</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+Y">Yao Guo</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+X">Xiapu Luo</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.21240v1-abstract-short" style="display: inline;"> As JavaScript has been criticized for performance and security issues in web applications, WebAssembly (Wasm) was proposed in 2017 and is regarded as the complementation for JavaScript. Due to its advantages like compact-size, native-like speed, and portability, Wasm binaries are gradually used as the compilation target for industrial projects in other high-level programming languages and are resp… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.21240v1-abstract-full').style.display = 'inline'; document.getElementById('2503.21240v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.21240v1-abstract-full" style="display: none;"> As JavaScript has been criticized for performance and security issues in web applications, WebAssembly (Wasm) was proposed in 2017 and is regarded as the complementation for JavaScript. Due to its advantages like compact-size, native-like speed, and portability, Wasm binaries are gradually used as the compilation target for industrial projects in other high-level programming languages and are responsible for computation-intensive tasks in browsers, e.g., 3D graphic rendering and video decoding. Intuitively, characterizing in-the-wild adopted Wasm binaries from different perspectives, like their metadata, relation with source programming language, existence of security threats, and practical purpose, is the prerequisite before delving deeper into the Wasm ecosystem and beneficial to its roadmap selection. However, currently, there is no work that conducts a large-scale measurement study on in-the-wild adopted Wasm binaries. To fill this gap, we collect the largest-ever dataset to the best of our knowledge, and characterize the status quo of them from industry perspectives. 
According to the different roles of people engaging in the community, i.e., web developers, Wasm maintainers, and researchers, we reorganized our findings to suggestions and best practices for them accordingly. We believe this work can shed light on the future direction of the web and Wasm. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.21240v1-abstract-full').style.display = 'none'; document.getElementById('2503.21240v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by FSE'25 Industry Track</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.20768">arXiv:2503.20768</a> <span> [<a href="https://arxiv.org/pdf/2503.20768">pdf</a>, <a href="https://arxiv.org/format/2503.20768">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> </div> </div> <p class="title is-5 mathjax"> An Empirical Study of the Impact of Federated Learning on Machine Learning Model Accuracy </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yang%2C+H">Haotian Yang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zhuoran Wang</a>, <a href="/search/cs?searchtype=author&query=Chou%2C+B">Benson Chou</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+S">Sophie Xu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+H">Hao Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+J">Jingxian Wang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Q">Qizhen Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.20768v2-abstract-short" style="display: inline;"> Federated Learning (FL) enables distributed ML model training on private user data at the global scale. Despite the potential of FL demonstrated in many domains, an in-depth view of its impact on model accuracy remains unclear. In this paper, we investigate, systematically, how this learning paradigm can affect the accuracy of state-of-the-art ML models for a variety of ML tasks. We present an emp… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.20768v2-abstract-full').style.display = 'inline'; document.getElementById('2503.20768v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.20768v2-abstract-full" style="display: none;"> Federated Learning (FL) enables distributed ML model training on private user data at the global scale. Despite the potential of FL demonstrated in many domains, an in-depth view of its impact on model accuracy remains unclear. 
In this paper, we investigate, systematically, how this learning paradigm can affect the accuracy of state-of-the-art ML models for a variety of ML tasks. We present an empirical study that involves various data types: text, image, audio, and video, and FL configuration knobs: data distribution, FL scale, client sampling, and local and global computations. Our experiments are conducted in a unified FL framework to achieve high fidelity, with substantial human efforts and resource investments. Based on the results, we perform a quantitative analysis of the impact of FL, and highlight challenging scenarios where applying FL degrades the accuracy of the model drastically and identify cases where the impact is negligible. The detailed and extensive findings can benefit practical deployments and future development of FL. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.20768v2-abstract-full').style.display = 'none'; document.getElementById('2503.20768v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 26 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> C.2.4; I.2.6 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.20680">arXiv:2503.20680</a> <span> [<a href="https://arxiv.org/pdf/2503.20680">pdf</a>, <a href="https://arxiv.org/format/2503.20680">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Vision as LoRA </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+H">Han Wang</a>, <a href="/search/cs?searchtype=author&query=Ye%2C+Y">Yongjie Ye</a>, <a href="/search/cs?searchtype=author&query=Li%2C+B">Bingru Li</a>, <a href="/search/cs?searchtype=author&query=Nie%2C+Y">Yuxiang Nie</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+J">Jinghui Lu</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+J">Jingqun Tang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yanjie Wang</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+C">Can Huang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.20680v1-abstract-short" style="display: inline;"> We introduce Vision as LoRA (VoRA), a novel paradigm for transforming an LLM into an MLLM. Unlike prevalent MLLM architectures that rely on external vision modules for vision encoding, VoRA internalizes visual capabilities by integrating vision-specific LoRA layers directly into the LLM. 
This design allows the added parameters to be seamlessly merged into the LLM during inference, eliminating stru… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.20680v1-abstract-full').style.display = 'inline'; document.getElementById('2503.20680v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.20680v1-abstract-full" style="display: none;"> We introduce Vision as LoRA (VoRA), a novel paradigm for transforming an LLM into an MLLM. Unlike prevalent MLLM architectures that rely on external vision modules for vision encoding, VoRA internalizes visual capabilities by integrating vision-specific LoRA layers directly into the LLM. This design allows the added parameters to be seamlessly merged into the LLM during inference, eliminating structural complexity and minimizing computational overhead. Moreover, inheriting the LLM's ability of handling flexible context, VoRA can process inputs at arbitrary resolutions. To further strengthen VoRA's visual capabilities, we introduce a block-wise distillation method that transfers visual priors from a pre-trained ViT into the LoRA layers, effectively accelerating training by injecting visual knowledge. Additionally, we apply bi-directional attention masks to better capture the context information of an image. We successfully demonstrate that with additional pre-training data, VoRA can perform comparably with conventional encode-based MLLMs. All training data, codes, and model weights will be released at https://github.com/Hon-Wong/VoRA. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.20680v1-abstract-full').style.display = 'none'; document.getElementById('2503.20680v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.20591">arXiv:2503.20591</a> <span> [<a href="https://arxiv.org/pdf/2503.20591">pdf</a>, <a href="https://arxiv.org/format/2503.20591">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> </div> </div> <p class="title is-5 mathjax"> NotebookOS: A Notebook Operating System for Interactive Training with On-Demand GPUs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Carver%2C+B">Benjamin Carver</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jingyuan Zhang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+H">Haoliang Wang</a>, <a href="/search/cs?searchtype=author&query=Mahadik%2C+K">Kanak Mahadik</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+Y">Yue Cheng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.20591v1-abstract-short" style="display: inline;"> Interactive notebook programming is universal in modern ML (machine learning) and AI (artificial intelligence) workflows. 
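The zero-overhead inference claim in entry 7 rests on the standard LoRA merging identity: once trained, a low-rank update can be folded into the base weight. The sketch below shows that generic identity, not code from the paper.

# Standard LoRA weight merging, which is what lets adapter parameters be
# "seamlessly merged into the LLM during inference" as entry 7 describes.
import numpy as np

def merge_lora(W: np.ndarray, A: np.ndarray, B: np.ndarray,
               alpha: float, rank: int) -> np.ndarray:
    """Fold a low-rank update into the base weight: W' = W + (alpha/r) * B @ A.

    Shapes: W is (d_out, d_in), B is (d_out, r), A is (r, d_in).
    """
    return W + (alpha / rank) * (B @ A)

# After merging, the layer computes W' @ x directly; the A/B matrices and the
# extra matmul they would cost at inference time disappear.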
8. arXiv:2503.20591 [cs.DC]
   NotebookOS: A Notebook Operating System for Interactive Training with On-Demand GPUs
   Authors: Benjamin Carver, Jingyuan Zhang, Haoliang Wang, Kanak Mahadik, Yue Cheng
   Abstract: Interactive notebook programming is universal in modern ML (machine learning) and AI (artificial intelligence) workflows. Notebook software like Jupyter and Google Colab provides a user-friendly, interactive, web-based programming interface and is widely used across science and engineering domains. A dominant application of production notebook workloads is interactive deep learning training (IDLT). To guarantee high interactivity, modern notebook platforms typically reserve GPU resources within actively running notebook sessions. These notebook sessions are long-running but exhibit intermittent and sporadic GPU usage. Consequently, during most of their lifetimes, notebook sessions do not use the reserved GPUs, resulting in extremely low GPU utilization and prohibitively high cost. In this paper, we introduce NotebookOS, a GPU-efficient notebook platform designed to meet the unique requirements of IDLT. NotebookOS uses a replicated notebook kernel design, where each kernel consists of three replicas distributed across separate GPU servers and synchronized via Raft. To optimize GPU utilization, NotebookOS oversubscribes server resources via kernel replication to leverage the relatively high task inter-arrival times in IDLT workloads. By dynamically allocating GPUs to kernel replicas only while they are actively executing notebook cells, NotebookOS maximizes the likelihood of immediate and interactive training upon notebook-cell task submission. NotebookOS also migrates kernel replicas and automatically scales the GPU cluster under overload conditions. We evaluate NotebookOS extensively using production notebook workloads. Evaluation results show that NotebookOS saves 1,187+ GPU hours over a 17.5-hour real-world IDLT workload while greatly enhancing interactivity.
   Submitted 26 March, 2025; originally announced March 2025.
   ACM Class: C.2.4

9. arXiv:2503.20527 [cs.CL, cs.AI]
   StableToolBench-MirrorAPI: Modeling Tool Environments as Mirrors of 7,000+ Real-World APIs
   Authors: Zhicheng Guo, Sijie Cheng, Yuchen Niu, Hao Wang, Sicheng Zhou, Wenbing Huang, Yang Liu
   Abstract: The rapid advancement of large language models (LLMs) has spurred significant interest in tool learning, where LLMs are augmented with external tools to tackle complex tasks. However, existing tool environments face challenges in balancing stability, scalability, and realness, particularly for benchmarking purposes. To address this problem, we propose MirrorAPI, a novel framework that trains specialized LLMs to accurately simulate real API responses, effectively acting as "mirrors" to tool environments. Using a comprehensive dataset of request-response pairs from 7,000+ APIs, we employ supervised fine-tuning and chain-of-thought reasoning to enhance simulation fidelity. MirrorAPI achieves superior accuracy and stability compared to state-of-the-art methods, as demonstrated by its performance on the newly constructed MirrorAPI-Bench and its integration into StableToolBench.
   Submitted 26 March, 2025; originally announced March 2025.
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.20527v1-abstract-full').style.display = 'none'; document.getElementById('2503.20527v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.20491">arXiv:2503.20491</a> <span> [<a href="https://arxiv.org/pdf/2503.20491">pdf</a>, <a href="https://arxiv.org/format/2503.20491">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> VPO: Aligning Text-to-Video Generation Models with Prompt Optimization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Cheng%2C+J">Jiale Cheng</a>, <a href="/search/cs?searchtype=author&query=Lyu%2C+R">Ruiliang Lyu</a>, <a href="/search/cs?searchtype=author&query=Gu%2C+X">Xiaotao Gu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+X">Xiao Liu</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+J">Jiazheng Xu</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+Y">Yida Lu</a>, <a href="/search/cs?searchtype=author&query=Teng%2C+J">Jiayan Teng</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Z">Zhuoyi Yang</a>, <a href="/search/cs?searchtype=author&query=Dong%2C+Y">Yuxiao Dong</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+J">Jie Tang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+H">Hongning Wang</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+M">Minlie Huang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.20491v1-abstract-short" style="display: inline;"> Video generation models have achieved remarkable progress in text-to-video tasks. These models are typically trained on text-video pairs with highly detailed and carefully crafted descriptions, while real-world user inputs during inference are often concise, vague, or poorly structured. This gap makes prompt optimization crucial for generating high-quality videos. Current methods often rely on lar… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.20491v1-abstract-full').style.display = 'inline'; document.getElementById('2503.20491v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.20491v1-abstract-full" style="display: none;"> Video generation models have achieved remarkable progress in text-to-video tasks. These models are typically trained on text-video pairs with highly detailed and carefully crafted descriptions, while real-world user inputs during inference are often concise, vague, or poorly structured. This gap makes prompt optimization crucial for generating high-quality videos. 
Current methods often rely on large language models (LLMs) to refine prompts through in-context learning, but suffer from several limitations: they may distort user intent, omit critical details, or introduce safety risks. Moreover, they optimize prompts without considering the impact on the final video quality, which can lead to suboptimal results. To address these issues, we introduce VPO, a principled framework that optimizes prompts based on three core principles: harmlessness, accuracy, and helpfulness. The generated prompts faithfully preserve user intents and, more importantly, enhance the safety and quality of generated videos. To achieve this, VPO employs a two-stage optimization approach. First, we construct and refine a supervised fine-tuning (SFT) dataset based on principles of safety and alignment. Second, we introduce both text-level and video-level feedback to further optimize the SFT model with preference learning. Our extensive experiments demonstrate that VPO significantly improves safety, alignment, and video quality compared to baseline methods. Moreover, VPO shows strong generalization across video generation models. Furthermore, we demonstrate that VPO could outperform and be combined with RLHF methods on video generation models, underscoring the effectiveness of VPO in aligning video generation models. Our code and data are publicly available at https://github.com/thu-coai/VPO. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.20491v1-abstract-full').style.display = 'none'; document.getElementById('2503.20491v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.19855">arXiv:2503.19855</a> <span> [<a href="https://arxiv.org/pdf/2503.19855">pdf</a>, <a href="https://arxiv.org/format/2503.19855">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Think Twice: Enhancing LLM Reasoning by Scaling Multi-round Test-time Thinking </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Tian%2C+X">Xiaoyu Tian</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+S">Sitong Zhao</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+H">Haotian Wang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+S">Shuaiting Chen</a>, <a href="/search/cs?searchtype=author&query=Ji%2C+Y">Yunjie Ji</a>, <a href="/search/cs?searchtype=author&query=Peng%2C+Y">Yiping Peng</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+H">Han Zhao</a>, <a href="/search/cs?searchtype=author&query=Li%2C+X">Xiangang Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.19855v1-abstract-short" style="display: inline;"> Recent advances in large language models (LLMs), such as OpenAI-o1 and DeepSeek-R1, have demonstrated the effectiveness of test-time scaling, where extended reasoning processes substantially enhance model performance. 
11. arXiv:2503.19855 [cs.CL]
    Think Twice: Enhancing LLM Reasoning by Scaling Multi-round Test-time Thinking
    Authors: Xiaoyu Tian, Sitong Zhao, Haotian Wang, Shuaiting Chen, Yunjie Ji, Yiping Peng, Han Zhao, Xiangang Li
    Abstract: Recent advances in large language models (LLMs), such as OpenAI-o1 and DeepSeek-R1, have demonstrated the effectiveness of test-time scaling, where extended reasoning processes substantially enhance model performance. Despite this, current models are constrained by limitations in handling long texts and reinforcement learning (RL) training efficiency. To address these issues, we propose a simple yet effective test-time scaling approach, Multi-round Thinking. This method iteratively refines model reasoning by leveraging previous answers as prompts for subsequent rounds. Extensive experiments across multiple models, including QwQ-32B and DeepSeek-R1, consistently show performance improvements on various benchmarks such as AIME 2024, MATH-500, GPQA-diamond, and LiveCodeBench. For instance, the accuracy of QwQ-32B improved from 80.3% (Round 1) to 82.1% (Round 2) on the AIME 2024 dataset, while DeepSeek-R1 showed a similar increase from 79.7% to 82.0%. These results confirm that Multi-round Thinking is a broadly applicable, straightforward approach to achieving stable enhancements in model performance, underscoring its potential for future developments in test-time scaling techniques. The key prompt: {Original question prompt} The assistant's previous answer is: <answer> {last round answer} </answer>, and please re-answer.
    Submitted 25 March, 2025; originally announced March 2025.
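The abstract of entry 11 quotes its key prompt verbatim, so the whole method can be transcribed as a small loop; only the ask_model callable is a placeholder for whatever LLM call is used.

# Transcription of the multi-round scheme in entry 11, using the key prompt
# quoted in the abstract. ask_model is a placeholder, not a real API.
def multi_round_thinking(ask_model, question: str, rounds: int = 2) -> str:
    answer = ask_model(question)  # Round 1: just the original question
    for _ in range(rounds - 1):
        prompt = (
            f"{question}\n"
            f"The assistant's previous answer is: <answer> {answer} </answer>, "
            f"and please re-answer."
        )
        answer = ask_model(prompt)  # each round sees only the prior answer
    return answer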
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.19801">arXiv:2503.19801</a> <span> [<a href="https://arxiv.org/pdf/2503.19801">pdf</a>, <a href="https://arxiv.org/format/2503.19801">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> SeLIP: Similarity Enhanced Contrastive Language Image Pretraining for Multi-modal Head MRI </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+Z">Zhiyang Liu</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+D">Dong Yang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+M">Minghao Zhang</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+H">Hanyu Sun</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+H">Hong Wu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+H">Huiying Wang</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+W">Wen Shen</a>, <a href="/search/cs?searchtype=author&query=Chai%2C+C">Chao Chai</a>, <a href="/search/cs?searchtype=author&query=Xia%2C+S">Shuang Xia</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.19801v1-abstract-short" style="display: inline;"> Despite that deep learning (DL) methods have presented tremendous potential in many medical image analysis tasks, the practical applications of medical DL models are limited due to the lack of enough data samples with manual annotations. By noting that the clinical radiology examinations are associated with radiology reports that describe the images, we propose to develop a foundation model for mu… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.19801v1-abstract-full').style.display = 'inline'; document.getElementById('2503.19801v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.19801v1-abstract-full" style="display: none;"> Despite that deep learning (DL) methods have presented tremendous potential in many medical image analysis tasks, the practical applications of medical DL models are limited due to the lack of enough data samples with manual annotations. By noting that the clinical radiology examinations are associated with radiology reports that describe the images, we propose to develop a foundation model for multi-model head MRI by using contrastive learning on the images and the corresponding radiology findings. In particular, a contrastive learning framework is proposed, where a mixed syntax and semantic similarity matching metric is integrated to reduce the thirst of extreme large dataset in conventional contrastive learning framework. Our proposed similarity enhanced contrastive language image pretraining (SeLIP) is able to effectively extract more useful features. 

arXiv:2503.19633 (https://arxiv.org/abs/2503.19633) [pdf, other]
Categories: cs.CL
Title: 1.4 Million Open-Source Distilled Reasoning Dataset to Empower Large Language Model Training
Authors: Han Zhao, Haotian Wang, Yiping Peng, Sitong Zhao, Xiaoyu Tian, Shuaiting Chen, Yunjie Ji, Xiangang Li
Abstract: AM-DeepSeek-R1-Distilled is a large-scale dataset with thinking traces for general reasoning tasks, composed of high-quality and challenging reasoning problems. These problems are collected from a multitude of open-source datasets, subjected to semantic deduplication, and meticulously cleaned to eliminate test set contamination. All responses in the dataset are distilled from reasoning models (predominantly DeepSeek-R1) and have undergone rigorous verification: mathematical problems are validated by checking against reference answers, code problems are verified using test cases, and other tasks are evaluated with the aid of a reward model. The AM-Distill-Qwen-32B model, trained with only simple supervised fine-tuning (SFT) on this data, outperformed the DeepSeek-R1-Distill-Qwen-32B model on four benchmarks: AIME2024, MATH-500, GPQA-Diamond, and LiveCodeBench. The AM-Distill-Qwen-72B model likewise surpassed the DeepSeek-R1-Distill-Llama-70B model on all benchmarks. We are releasing these 1.4 million problems and their corresponding responses to the research community with the objective of fostering the development of powerful reasoning-oriented Large Language Models (LLMs). The dataset is published at https://huggingface.co/datasets/a-m-team/AM-DeepSeek-R1-Distilled-1.4M.
Submitted 25 March, 2025; originally announced March 2025.
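The three verification routes described above map naturally onto a dispatcher. A hedged sketch follows; the three verifier callables are injected because the paper does not name its internal tooling:

```python
def verify_response(task: dict, response: str,
                    check_math, run_tests, reward_model,
                    threshold: float = 0.5) -> bool:
    """Route a distilled response to the matching verifier (sketch only).

    check_math(response, reference) -> bool: compare final answers.
    run_tests(response, test_cases) -> bool: execute code against tests.
    reward_model(prompt, response) -> float: scalar quality score.
    """
    if task["type"] == "math":
        return check_math(response, task["reference_answer"])
    if task["type"] == "code":
        return run_tests(response, task["test_cases"])
    return reward_model(task["prompt"], response) >= threshold
```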

arXiv:2503.19591 (https://arxiv.org/abs/2503.19591) [pdf, other]
Categories: cs.SD, cs.CR, cs.LG, eess.AS
Title: Boosting the Transferability of Audio Adversarial Examples with Acoustic Representation Optimization
Authors: Weifei Jin, Junjie Su, Hejia Wang, Yulin Ye, Jie Hao
Abstract: With the widespread application of automatic speech recognition (ASR) systems, their vulnerability to adversarial attacks has been extensively studied. However, most existing adversarial examples are generated on specific individual models, resulting in a lack of transferability. In real-world scenarios, attackers often cannot access detailed information about the target model, making query-based attacks infeasible. To address this challenge, we propose a technique called Acoustic Representation Optimization that aligns adversarial perturbations with low-level acoustic characteristics derived from speech representation models. Rather than relying on model-specific, higher-layer abstractions, our approach leverages fundamental acoustic representations that remain consistent across diverse ASR architectures. By enforcing an acoustic representation loss to guide perturbations toward these robust, lower-level representations, we enhance the cross-model transferability of adversarial examples without degrading audio quality. Our method is plug-and-play and can be integrated with any existing attack method. We evaluate our approach on three modern ASR models, and the experimental results demonstrate that our method significantly improves the transferability of adversarial examples generated by previous methods while preserving audio quality.
Submitted 25 March, 2025; originally announced March 2025.
Comments: Accepted to ICME 2025
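The central idea, steering the perturbation with a loss on low-level acoustic representations rather than model-specific top layers, can be sketched as a feature-matching term added to any attack objective. Everything here (`low_level_features`, the MSE form, the weighting) is an assumption for illustration:

```python
import torch.nn.functional as F

def acoustic_representation_loss(audio_adv, target_repr, low_level_features):
    """Pull early-layer acoustic features of the adversarial audio toward a
    target representation, encouraging model-agnostic perturbations."""
    feat_adv = low_level_features(audio_adv)  # early-layer activations
    return F.mse_loss(feat_adv, target_repr)

# Plug-and-play use with an existing attack loss (weight is illustrative):
# loss = attack_loss + 0.1 * acoustic_representation_loss(x_adv, t, feats)
```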
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to ICME 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.19470">arXiv:2503.19470</a> <span> [<a href="https://arxiv.org/pdf/2503.19470">pdf</a>, <a href="https://arxiv.org/format/2503.19470">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> ReSearch: Learning to Reason with Search for LLMs via Reinforcement Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+M">Mingyang Chen</a>, <a href="/search/cs?searchtype=author&query=Li%2C+T">Tianpeng Li</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+H">Haoze Sun</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+Y">Yijie Zhou</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+C">Chenzheng Zhu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+H">Haofen Wang</a>, <a href="/search/cs?searchtype=author&query=Pan%2C+J+Z">Jeff Z. Pan</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+W">Wen Zhang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+H">Huajun Chen</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+F">Fan Yang</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+Z">Zenan Zhou</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+W">Weipeng Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.19470v2-abstract-short" style="display: inline;"> Large Language Models (LLMs) have shown remarkable capabilities in reasoning, exemplified by the success of OpenAI-o1 and DeepSeek-R1. However, integrating reasoning with external search processes remains challenging, especially for complex multi-hop questions requiring multiple retrieval steps. We propose ReSearch, a novel framework that trains LLMs to Reason with Search via reinforcement learnin… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.19470v2-abstract-full').style.display = 'inline'; document.getElementById('2503.19470v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.19470v2-abstract-full" style="display: none;"> Large Language Models (LLMs) have shown remarkable capabilities in reasoning, exemplified by the success of OpenAI-o1 and DeepSeek-R1. However, integrating reasoning with external search processes remains challenging, especially for complex multi-hop questions requiring multiple retrieval steps. We propose ReSearch, a novel framework that trains LLMs to Reason with Search via reinforcement learning without using any supervised data on reasoning steps. Our approach treats search operations as integral components of the reasoning chain, where when and how to perform searches is guided by text-based thinking, and search results subsequently influence further reasoning. We train ReSearch on Qwen2.5-7B(-Instruct) and Qwen2.5-32B(-Instruct) models and conduct extensive experiments. 

arXiv:2503.18942 (https://arxiv.org/abs/2503.18942) [pdf, other]
Categories: cs.CV, cs.AI
Title: Video-T1: Test-Time Scaling for Video Generation
Authors: Fangfu Liu, Hanyang Wang, Yimo Cai, Kaiyan Zhang, Xiaohang Zhan, Yueqi Duan
Abstract: With the ability to scale training data, model size, and computational cost, video generation has achieved impressive results in digital creation, enabling users to express creativity across various domains. Recently, researchers in Large Language Models (LLMs) have expanded scaling to test time, which can significantly improve LLM performance by using more inference-time computation. Instead of scaling up video foundation models through expensive training, we explore the power of Test-Time Scaling (TTS) in video generation, aiming to answer the question: if a video generation model is allowed to use a non-trivial amount of inference-time compute, how much can it improve generation quality given a challenging text prompt? In this work, we reinterpret the test-time scaling of video generation as a search problem: sampling better trajectories from Gaussian noise space to the target video distribution. Specifically, we build the search space with test-time verifiers to provide feedback and heuristic algorithms to guide the search process. Given a text prompt, we first explore an intuitive linear search strategy that increases the number of noise candidates at inference time. Since full-step denoising of all frames simultaneously incurs heavy test-time computation costs, we further design a more efficient TTS method for video generation called Tree-of-Frames (ToF) that adaptively expands and prunes video branches in an autoregressive manner. Extensive experiments on text-conditioned video generation benchmarks demonstrate that increasing test-time compute consistently leads to significant improvements in video quality. Project page: https://liuff19.github.io/Video-T1
Submitted 24 March, 2025; originally announced March 2025.
Comments: Project page: https://liuff19.github.io/Video-T1
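The linear search strategy is essentially best-of-N over initial noises; a hedged sketch (with `generate_video`, `verifier_score`, and the latent shape all assumed) is:

```python
import torch

def linear_search_tts(prompt, generate_video, verifier_score,
                      n_candidates: int = 8, latent_shape=(1, 16, 4, 32, 32)):
    """Best-of-N test-time scaling: denoise several noise candidates fully
    and keep the video the test-time verifier scores highest."""
    best_video, best_score = None, float("-inf")
    for _ in range(n_candidates):
        noise = torch.randn(latent_shape)       # fresh Gaussian candidate
        video = generate_video(prompt, noise)   # full denoising trajectory
        score = verifier_score(prompt, video)   # verifier feedback
        if score > best_score:
            best_video, best_score = video, score
    return best_video
```

Tree-of-Frames replaces this flat loop with frame-by-frame branch expansion and pruning, trading the same compute for a deeper search.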

arXiv:2503.18830 (https://arxiv.org/abs/2503.18830) [pdf, other]
Categories: cs.CV
Title: DAGait: Generalized Skeleton-Guided Data Alignment for Gait Recognition
Authors: Zhengxian Wu, Chuanrui Zhang, Hangrui Xu, Peng Jiao, Haoqian Wang
Abstract: Gait recognition is emerging as a promising and innovative area within the field of computer vision, widely applied to remote person identification. Although existing gait recognition methods have achieved substantial success on controlled laboratory datasets, their performance often declines significantly when transitioning to wild datasets. We argue that this performance gap can be primarily attributed to the spatio-temporal distribution inconsistencies present in wild datasets, where subjects appear at varying angles, positions, and distances across frames. To achieve accurate gait recognition in the wild, we propose a skeleton-guided silhouette alignment strategy, which uses prior knowledge of the skeletons to perform affine transformations on the corresponding silhouettes. To the best of our knowledge, this is the first study to explore the impact of data alignment on gait recognition. We conducted extensive experiments across multiple datasets and network architectures, and the results demonstrate the significant advantages of our proposed alignment strategy. Specifically, on the challenging Gait3D dataset, our method achieved an average performance improvement of 7.9% across all evaluated networks. Furthermore, our method achieves substantial improvements on cross-domain datasets, with accuracy gains of up to 24.0%.
Submitted 24 March, 2025; originally announced March 2025.
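The alignment step itself is compact: estimate an affine transform from detected skeleton joints to a canonical pose, then warp the silhouette. A hedged OpenCV sketch follows (the canonical joints and output size are assumptions, and this is not the authors' implementation):

```python
import cv2
import numpy as np

def align_silhouette(silhouette: np.ndarray, joints: np.ndarray,
                     canonical: np.ndarray, size=(64, 64)) -> np.ndarray:
    """Warp a silhouette so its skeleton matches a canonical pose.

    silhouette: (H, W) uint8 mask; joints/canonical: (K, 2) corresponding
    joint coordinates (e.g., head, hips, ankles) in image space.
    """
    # Least-squares partial-affine fit from detected to canonical joints.
    matrix, _ = cv2.estimateAffinePartial2D(joints.astype(np.float32),
                                            canonical.astype(np.float32))
    return cv2.warpAffine(silhouette, matrix, size)
```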

arXiv:2503.18808 (https://arxiv.org/abs/2503.18808) [pdf, other]
Categories: cs.CV
Title: CRCL: Causal Representation Consistency Learning for Anomaly Detection in Surveillance Videos
Authors: Yang Liu, Hongjin Wang, Zepu Wang, Xiaoguang Zhu, Jing Liu, Peng Sun, Rui Tang, Jianwei Du, Victor C. M. Leung, Liang Song
Abstract: Video Anomaly Detection (VAD) remains a fundamental yet formidable task in the video understanding community, with promising applications in areas such as information forensics and public safety protection. Because anomalies are rare and diverse, existing methods use only easily collected regular events to model the inherent normality of spatial-temporal patterns in an unsupervised manner. Previous studies have shown that existing unsupervised VAD models cannot cope with label-independent data offsets (e.g., scene changes) in real-world scenarios and may fail to respond to subtle anomalies due to the overgeneralization of deep neural networks. Inspired by causality learning, we argue that there exist causal factors that can adequately generalize the prototypical patterns of regular events and exhibit significant deviations when anomalous instances occur. In this regard, we propose Causal Representation Consistency Learning (CRCL) to implicitly mine potential scene-robust causal variables in unsupervised video normality learning. Specifically, building on structural causal models, we propose scene-debiasing learning and causality-inspired normality learning to strip entangled scene bias from deep representations and to learn causal video normality, respectively. Extensive experiments on benchmarks validate the superiority of our method over conventional deep representation learning. Moreover, ablation studies and extension validation show that CRCL can cope with label-independent biases in multi-scene settings and maintains stable performance with only limited training data available.
Submitted 24 March, 2025; originally announced March 2025.
Comments: Accepted for publication by IEEE Transactions on Image Processing

arXiv:2503.18666 (https://arxiv.org/abs/2503.18666) [pdf, other]
Categories: cs.AI, cs.CL
Title: AgentSpec: Customizable Runtime Enforcement for Safe and Reliable LLM Agents
Authors: Haoyu Wang, Christopher M. Poskitt, Jun Sun
Abstract: Agents built on LLMs are increasingly deployed across diverse domains, automating complex decision-making and task execution. However, their autonomy introduces safety risks, including security vulnerabilities, legal violations, and unintended harmful actions. Existing mitigation methods, such as model-based safeguards and early enforcement strategies, fall short in robustness, interpretability, and adaptability. To address these challenges, we propose AgentSpec, a lightweight domain-specific language for specifying and enforcing runtime constraints on LLM agents. With AgentSpec, users define structured rules that incorporate triggers, predicates, and enforcement mechanisms, ensuring agents operate within predefined safety boundaries. We implement AgentSpec across multiple domains, including code execution, embodied agents, and autonomous driving, demonstrating its adaptability and effectiveness. Our evaluation shows that AgentSpec successfully prevents unsafe executions in over 90% of code agent cases, eliminates all hazardous actions in embodied agent tasks, and enforces 100% compliance by autonomous vehicles (AVs). Despite its strong safety guarantees, AgentSpec remains computationally lightweight, with overheads in milliseconds. By combining interpretability, modularity, and efficiency, AgentSpec provides a practical and scalable solution for enforcing LLM agent safety across diverse applications. We also automate the generation of rules using LLMs and assess their effectiveness. Our evaluation shows that the rules generated by OpenAI o1 achieve a precision of 95.56% and a recall of 70.96% for embodied agents, successfully identify 87.26% of risky code, and prevent AVs from breaking laws in 5 out of 8 scenarios.
Submitted 24 March, 2025; originally announced March 2025.
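The trigger/predicate/enforcement triple is the unit of the DSL; a hedged Python rendering of what such a rule could look like (the paper defines its own concrete syntax, which is not reproduced here) is:

```python
from dataclasses import dataclass
from typing import Callable

@dataclass
class Rule:
    """Sketch of an AgentSpec-style runtime rule."""
    trigger: str                           # event, e.g. "before_tool_call"
    predicate: Callable[[dict], bool]      # condition over the agent state
    enforce: Callable[[dict], dict]        # action: block, warn, rewrite...

# Illustrative rule: block shell invocations that delete files recursively.
block_rm = Rule(
    trigger="before_tool_call",
    predicate=lambda s: s.get("tool") == "shell" and "rm -rf" in s.get("args", ""),
    enforce=lambda s: {**s, "blocked": True},
)
```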

arXiv:2503.18640 (https://arxiv.org/abs/2503.18640) [pdf, other]
Categories: cs.CV
Title: LLGS: Unsupervised Gaussian Splatting for Image Enhancement and Reconstruction in Pure Dark Environment
Authors: Haoran Wang, Jingwei Huang, Lu Yang, Tianchen Deng, Gaojing Zhang, Mingrui Li
Abstract: 3D Gaussian Splatting has shown remarkable capabilities in novel view rendering tasks and exhibits significant potential for multi-view optimization. However, the original 3D Gaussian Splatting lacks color representation for inputs in low-light environments. Simply using enhanced images as inputs leads to issues with multi-view consistency, and current single-view enhancement systems rely on pre-trained data and lack scene generalization. These problems limit the application of 3D Gaussian Splatting in low-light conditions in robotics, including high-fidelity modeling and feature matching. To address these challenges, we propose an unsupervised multi-view stereoscopic system based on Gaussian Splatting, called Low-Light Gaussian Splatting (LLGS). This system aims to enhance images in low-light environments while reconstructing the scene. Our method introduces a decomposable Gaussian representation called M-Color, which separately characterizes color information for targeted enhancement. Furthermore, we propose an unsupervised optimization method with zero-knowledge priors, using direction-based enhancement to ensure multi-view consistency. Experiments conducted on real-world datasets demonstrate that our system outperforms state-of-the-art methods in both low-light enhancement and 3D Gaussian Splatting.
Submitted 24 March, 2025; originally announced March 2025.

arXiv:2503.18458 (https://arxiv.org/abs/2503.18458) [pdf, other]
Categories: cs.CV, cs.CL
Title: StableGS: A Floater-Free Framework for 3D Gaussian Splatting
Authors: Luchao Wang, Qian Ren, Kaimin Liao, Hua Wang, Zhi Chen, Yaohua Tang
Abstract: Recent years have witnessed the remarkable success of 3D Gaussian Splatting (3DGS) in novel view synthesis, surpassing prior differentiable rendering methods in both quality and efficiency. However, its training process suffers from coupled opacity-color optimization that frequently converges to local minima, producing floater artifacts that degrade visual fidelity. We present StableGS, a framework that eliminates floaters through cross-view depth consistency constraints while introducing a dual-opacity GS model to decouple the geometry and material properties of translucent objects. To further enhance reconstruction quality in weakly textured regions, we integrate DUSt3R depth estimation, significantly improving geometric stability. Our method fundamentally addresses 3DGS training instabilities, outperforming existing state-of-the-art methods across open-source datasets.
Submitted 24 March, 2025; v1 submitted 24 March, 2025; originally announced March 2025.
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.18458v2-abstract-full').style.display = 'none'; document.getElementById('2503.18458v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 24 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.18385">arXiv:2503.18385</a> <span> [<a href="https://arxiv.org/pdf/2503.18385">pdf</a>, <a href="https://arxiv.org/format/2503.18385">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> RoCA: Robust Contrastive One-class Time Series Anomaly Detection with Contaminated Data </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Mou%2C+X">Xudong Mou</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+R">Rui Wang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+B">Bo Li</a>, <a href="/search/cs?searchtype=author&query=Wo%2C+T">Tianyu Wo</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+J">Jie Sun</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+H">Hui Wang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+X">Xudong Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.18385v1-abstract-short" style="display: inline;"> The accumulation of time-series signals and the absence of labels make time-series Anomaly Detection (AD) a self-supervised task of deep learning. Methods based on normality assumptions face the following three limitations: (1) A single assumption could hardly characterize the whole normality or lead to some deviation. (2) Some assumptions may go against the principle of AD. (3) Their basic assump… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.18385v1-abstract-full').style.display = 'inline'; document.getElementById('2503.18385v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.18385v1-abstract-full" style="display: none;"> The accumulation of time-series signals and the absence of labels make time-series Anomaly Detection (AD) a self-supervised task of deep learning. Methods based on normality assumptions face the following three limitations: (1) A single assumption could hardly characterize the whole normality or lead to some deviation. (2) Some assumptions may go against the principle of AD. (3) Their basic assumption is that the training data is uncontaminated (free of anomalies), which is unrealistic in practice, leading to a decline in robustness. This paper proposes a novel robust approach, RoCA, which is the first to address all of the above three challenges, as far as we are aware. 

arXiv:2503.18302 (https://arxiv.org/abs/2503.18302) [pdf, other]
Categories: cs.AI, cs.LG
Title: DiffMove: Group Mobility Tendency Enhanced Trajectory Recovery via Diffusion Model
Authors: Qingyue Long, Can Rong, Huandong Wang, Shaw Rajib, Yong Li
Abstract: In the real world, trajectory data is often sparse and incomplete due to low collection frequencies or limited device coverage. Trajectory recovery aims to restore these missing trajectory points, making trajectories denser and more complete. However, this task faces two key challenges: 1) the excessive sparsity of individual trajectories makes it difficult to effectively leverage historical information for recovery; 2) sparse trajectories make it harder to capture complex individual mobility preferences. To address these challenges, we propose a novel method called DiffMove. First, we harness crowd wisdom for trajectory recovery: we construct a group tendency graph using the collective trajectories of all users and then integrate group mobility trends into the location representations via graph embedding. This addresses the problem that sparse trajectories cannot rely on individual history alone for recovery. Second, we capture individual mobility preferences from both historical and current perspectives. Finally, we integrate group mobility tendencies and individual preferences into the spatiotemporal distribution of the trajectory to recover high-quality trajectories. Extensive experiments on two real-world datasets demonstrate that DiffMove outperforms existing state-of-the-art methods. Further analysis validates the robustness of our method.
Submitted 23 March, 2025; originally announced March 2025.

arXiv:2503.18245 (https://arxiv.org/abs/2503.18245) [pdf, other]
Categories: cs.LG
Title: DiffGED: Computing Graph Edit Distance via Diffusion-based Graph Matching
Authors: Wei Huang, Hanchen Wang, Dong Wen, Wenjie Zhang, Ying Zhang, Xuemin Lin
Abstract: The Graph Edit Distance (GED) problem, which aims to compute the minimum number of edit operations required to transform one graph into another, is a fundamental challenge in graph analysis with wide-ranging applications. However, due to its NP-hard nature, traditional A* approaches often suffer from scalability issues, making them computationally intractable for large graphs. Many recent deep learning frameworks address GED by formulating it as a regression task, which, while efficient, fails to recover the edit path, a central interest in GED. Furthermore, recent hybrid approaches that combine deep learning with traditional methods to recover the edit path often yield poor solution quality, and they struggle to generate candidate solutions in parallel, resulting in increased running times. In this paper, we present DiffGED, a novel approach that leverages a generative diffusion model to solve GED and recover the corresponding edit path. Specifically, we first generate multiple diverse node matching matrices in parallel through a diffusion-based graph matching model. Next, node mappings are extracted from each generated matching matrix in parallel, and each extracted node mapping can be simply transformed into an edit path. Benefiting from the generative diversity provided by the diffusion model, DiffGED is less likely to fall into locally suboptimal solutions, thereby achieving overall solution quality close to the exact solution. Experimental results on real-world datasets demonstrate that DiffGED can generate multiple diverse edit paths with exceptionally high accuracy, comparable to exact solutions, while maintaining a running time shorter than that of most hybrid approaches.
Submitted 23 March, 2025; originally announced March 2025.
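The step that makes edit paths recoverable is rounding each soft matching matrix into a one-to-one node mapping. A hedged greedy sketch follows (the paper's rounding scheme may differ):

```python
import numpy as np

def extract_mapping(match_matrix: np.ndarray):
    """Greedily round a soft node-matching matrix to a one-to-one mapping.

    match_matrix: (n, m) scores from the diffusion-based matching model.
    The returned (source, target) pairs induce an edit path: relabel
    matched nodes, then insert/delete the unmatched remainder.
    """
    scores = match_matrix.astype(float).copy()
    mapping = []
    for _ in range(min(scores.shape)):
        i, j = np.unravel_index(np.argmax(scores), scores.shape)
        mapping.append((int(i), int(j)))
        scores[i, :] = -np.inf    # each source node matched at most once
        scores[:, j] = -np.inf    # each target node matched at most once
    return mapping
```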

arXiv:2503.17675 (https://arxiv.org/abs/2503.17675) [pdf, other]
Categories: cs.CV
Title: Towards Transformer-Based Aligned Generation with Self-Coherence Guidance
Authors: Shulei Wang, Wang Lin, Hai Huang, Hanting Wang, Sihang Cai, WenKang Han, Tao Jin, Jingyuan Chen, Jiacheng Sun, Jieming Zhu, Zhou Zhao
Abstract: We introduce a novel, training-free approach for enhancing alignment in Transformer-based Text-Guided Diffusion Models (TGDMs). Existing TGDMs often struggle to generate semantically aligned images, particularly when dealing with complex text prompts or multi-concept attribute binding challenges. Previous U-Net-based methods primarily optimized the latent space, but their direct application to Transformer-based architectures has shown limited effectiveness. Our method addresses these challenges by directly optimizing cross-attention maps during the generation process. Specifically, we introduce Self-Coherence Guidance, a method that dynamically refines attention maps using masks derived from previous denoising steps, ensuring precise alignment without additional training. To validate our approach, we constructed more challenging benchmarks for evaluating coarse-grained attribute binding, fine-grained attribute binding, and style binding. Experimental results demonstrate the superior performance of our method, significantly surpassing other state-of-the-art methods across all evaluated tasks. Our code is available at https://scg-diffusion.github.io/scg-diffusion.
Submitted 22 March, 2025; originally announced March 2025.
Comments: Accepted by CVPR 2025

arXiv:2503.17670 (https://arxiv.org/abs/2503.17670) [pdf, other]
Categories: cs.HC
Title: Do You "Trust" This Visualization? An Inventory to Measure Trust in Visualizations
Authors: Huichen Will Wang, Kylie Lin, Andrew Cohen, Ryan Kennedy, Zach Zwald, Carolina Nobre, Cindy Xiong Bearfield
Abstract: Trust plays a critical role in visual data communication and decision-making, yet existing visualization research employs varied trust measures, making it challenging to compare and synthesize findings across studies. In this work, we first took a bottom-up, data-driven approach to understand what visualization readers mean when they say they "trust" a visualization. We compiled and adapted a broad set of trust-related statements from existing inventories and collected responses on visualizations with varying degrees of trustworthiness. Through exploratory factor analysis, we derived an operational definition of trust in visualizations. Our findings indicate that people perceive a trustworthy visualization as one that presents credible information and is comprehensible and usable. Additionally, we found that general trust disposition influences how individuals assess visualization trustworthiness. Building on these insights, we developed a compact inventory consisting of statements that not only effectively represent each trust factor but also exhibit high item discrimination. We further validated our inventory through two trust games with real-world stakes, demonstrating that our measures reliably predict behavioral trust. Finally, we illustrate how this standardized inventory can be applied across diverse visualization research contexts. Utilizing our inventory, future research can examine how design choices, tasks, and domains influence trust, and how to foster appropriate trusting behavior in human-data interactions.
Submitted 22 March, 2025; originally announced March 2025.
Our findings indicate that people perceive a trustworthy visualization as one that presents credible information and is comprehensible and usable. Additionally, we found that general trust disposition influences how individuals assess visualization trustworthiness. Building on these insights, we developed a compact inventory consisting of statements that not only effectively represent each trust factor but also exhibit high item discrimination. We further validated our inventory through two trust games with real-world stakes, demonstrating that our measures reliably predict behavioral trust. Finally, we illustrate how this standardized inventory can be applied across diverse visualization research contexts. Utilizing our inventory, future research can examine how design choices, tasks, and domains influence trust, and how to foster appropriate trusting behavior in human-data interactions. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.17670v1-abstract-full').style.display = 'none'; document.getElementById('2503.17670v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.17355">arXiv:2503.17355</a> <span> [<a href="https://arxiv.org/pdf/2503.17355">pdf</a>, <a href="https://arxiv.org/format/2503.17355">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Statistics Theory">math.ST</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Glivenko-Cantelli for $f$-divergence </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+H">Haoming Wang</a>, <a href="/search/cs?searchtype=author&query=Lim%2C+L">Lek-Heng Lim</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.17355v2-abstract-short" style="display: inline;"> We extend the celebrated Glivenko-Cantelli theorem, sometimes called the fundamental theorem of statistics, from its standard setting of total variation distance to all $f$-divergences. A key obstacle in this endeavor is to define $f$-divergence on a subcollection of a $σ$-algebra that forms a $π$-system but not a $σ$-subalgebra. This is a side contribution of our work. We will show that this noti… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.17355v2-abstract-full').style.display = 'inline'; document.getElementById('2503.17355v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.17355v2-abstract-full" style="display: none;"> We extend the celebrated Glivenko-Cantelli theorem, sometimes called the fundamental theorem of statistics, from its standard setting of total variation distance to all $f$-divergences. A key obstacle in this endeavor is to define $f$-divergence on a subcollection of a $σ$-algebra that forms a $π$-system but not a $σ$-subalgebra. This is a side contribution of our work. 
We will show that this notion of $f$-divergence on the $π$-system of rays preserves nearly all known properties of standard $f$-divergence, yields a novel integral representation of the Kolmogorov-Smirnov distance, and has a Glivenko-Cantelli theorem. We will also discuss the prospects of a Vapnik-Chervonenkis theory for $f$-divergence. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.17355v2-abstract-full').style.display = 'none'; document.getElementById('2503.17355v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 21 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">26 pages, 1 figure</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">MSC Class:</span> 60B10; 60F15; 60F25 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.17343">arXiv:2503.17343</a> <span> [<a href="https://arxiv.org/pdf/2503.17343">pdf</a>, <a href="https://arxiv.org/format/2503.17343">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Networking and Internet Architecture">cs.NI</span> </div> </div> <p class="title is-5 mathjax"> Commercial Dishes Can Be My Ladder: Sustainable and Collaborative Data Offloading in LEO Satellite Networks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chou%2C+Y+C">Yi Ching Chou</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+L">Long Chen</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+H">Hengzhi Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+F">Feng Wang</a>, <a href="/search/cs?searchtype=author&query=Fang%2C+H">Hao Fang</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+H">Haoyuan Zhao</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+M">Miao Zhang</a>, <a href="/search/cs?searchtype=author&query=Fan%2C+X">Xiaoyi Fan</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+J">Jiangchuan Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.17343v1-abstract-short" style="display: inline;"> Low Earth Orbit (LEO) satellite networks, characterized by their high data throughput and low latency, have gained significant interest from both industry and academia. Routing data efficiently within these networks is essential for maintaining a high quality of service. However, current routing strategies, such as bent-pipe and inter-satellite link (ISL) routing, have their unique challenges. 
The… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.17343v1-abstract-full').style.display = 'inline'; document.getElementById('2503.17343v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.17343v1-abstract-full" style="display: none;"> Low Earth Orbit (LEO) satellite networks, characterized by their high data throughput and low latency, have gained significant interest from both industry and academia. Routing data efficiently within these networks is essential for maintaining a high quality of service. However, current routing strategies, such as bent-pipe and inter-satellite link (ISL) routing, have their unique challenges. The bent-pipe strategy requires a dense deployment of dedicated ground stations, while the ISL-based strategy can negatively impact satellite battery lifespan due to increased traffic load, leading to sustainability issues. In this paper, we propose sustainable collaborative offloading, a framework that orchestrates groups of existing commercial resources like ground stations and 5G base stations for data offloading. This orchestration enhances total capacity, overcoming the limitations of a single resource. We propose the collaborator group set construction algorithm to construct candidate groups and the collaborator selection and total payment algorithm to select offloading targets and determine payments no less than the costs. Extensive real-world-based simulations show that our solution significantly improves energy consumption, satellite service life, and end-to-end latency. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.17343v1-abstract-full').style.display = 'none'; document.getElementById('2503.17343v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. 
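<p>(Editorial illustration.) The collaborator selection and payment step described in the abstract above can be pictured with a minimal greedy sketch: pick the cheapest spare capacity until the offloading demand is covered, and pay each selected collaborator no less than its declared cost. The field names (capacity_mbps, cost) and the greedy rule below are illustrative assumptions, not the authors' algorithm.</p> <pre><code># Illustrative sketch only: greedy selection of offloading collaborators.
# Field names are assumptions, not taken from the paper.
from dataclasses import dataclass

@dataclass
class Collaborator:
    name: str
    capacity_mbps: float  # spare capacity a ground station / 5G base station offers
    cost: float           # its declared cost for serving the offload

def select_and_pay(candidates, demand_mbps):
    chosen, payments = [], {}
    remaining = demand_mbps
    # cheapest capacity first (cost per Mbps)
    for c in sorted(candidates, key=lambda c: c.cost / c.capacity_mbps):
        if remaining <= 0:
            break
        chosen.append(c)
        payments[c.name] = c.cost  # payment no less than the declared cost
        remaining -= c.capacity_mbps
    return chosen, payments

print(select_and_pay([Collaborator("gs-1", 40.0, 8.0),
                      Collaborator("bs-2", 25.0, 3.5),
                      Collaborator("gs-3", 30.0, 9.0)], 60.0))
</code></pre>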
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">This is a preliminary extended version of the paper accepted to INFOCOM 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.17106">arXiv:2503.17106</a> <span> [<a href="https://arxiv.org/pdf/2503.17106">pdf</a>, <a href="https://arxiv.org/format/2503.17106">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> GAA-TSO: Geometry-Aware Assisted Depth Completion for Transparent and Specular Objects </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yizhe Liu</a>, <a href="/search/cs?searchtype=author&query=Jia%2C+T">Tong Jia</a>, <a href="/search/cs?searchtype=author&query=Cai%2C+D">Da Cai</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+H">Hao Wang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+D">Dongyue Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.17106v1-abstract-short" style="display: inline;"> Transparent and specular objects are frequently encountered in daily life, factories, and laboratories. However, due to the unique optical properties, the depth information on these objects is usually incomplete and inaccurate, which poses significant challenges for downstream robotics tasks. Therefore, it is crucial to accurately restore the depth information of transparent and specular objects.… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.17106v1-abstract-full').style.display = 'inline'; document.getElementById('2503.17106v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.17106v1-abstract-full" style="display: none;"> Transparent and specular objects are frequently encountered in daily life, factories, and laboratories. However, due to the unique optical properties, the depth information on these objects is usually incomplete and inaccurate, which poses significant challenges for downstream robotics tasks. Therefore, it is crucial to accurately restore the depth information of transparent and specular objects. Previous depth completion methods for these objects usually use RGB information as an additional channel of the depth image to perform depth prediction. Due to the poor-texture characteristics of transparent and specular objects, these methods that rely heavily on color information tend to generate structure-less depth predictions. Moreover, these 2D methods cannot effectively explore the 3D structure hidden in the depth channel, resulting in depth ambiguity. To this end, we propose a geometry-aware assisted depth completion method for transparent and specular objects, which focuses on exploring the 3D structural cues of the scene. Specifically, besides extracting 2D features from RGB-D input, we back-project the input depth to a point cloud and build the 3D branch to extract hierarchical scene-level 3D structural features. 
To exploit 3D geometric information, we design several gated cross-modal fusion modules to effectively propagate multi-level 3D geometric features to the image branch. In addition, we propose an adaptive correlation aggregation strategy to appropriately assign 3D features to the corresponding 2D features. Extensive experiments on ClearGrasp, OOD, TransCG, and STD datasets show that our method outperforms other state-of-the-art methods. We further demonstrate that our method significantly enhances the performance of downstream robotic grasping tasks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.17106v1-abstract-full').style.display = 'none'; document.getElementById('2503.17106v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.16913">arXiv:2503.16913</a> <span> [<a href="https://arxiv.org/pdf/2503.16913">pdf</a>, <a href="https://arxiv.org/format/2503.16913">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Software Engineering">cs.SE</span> </div> </div> <p class="title is-5 mathjax"> FAIT: Fault-Aware Fine-Tuning for Better Code Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Fan%2C+L">Lishui Fan</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Z">Zhongxin Liu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+H">Haoye Wang</a>, <a href="/search/cs?searchtype=author&query=Bao%2C+L">Lingfeng Bao</a>, <a href="/search/cs?searchtype=author&query=Xia%2C+X">Xin Xia</a>, <a href="/search/cs?searchtype=author&query=Li%2C+S">Shanping Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.16913v1-abstract-short" style="display: inline;"> Modern instruction-tuned large language models (LLMs) have made remarkable progress in code generation. However, these LLMs fine-tuned with standard supervised fine-tuning (SFT) sometimes generate plausible-looking but functionally incorrect code variants. This issue likely stems from the limitation of standard SFT, which treats all tokens equally during optimization and fails to emphasize the err… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.16913v1-abstract-full').style.display = 'inline'; document.getElementById('2503.16913v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.16913v1-abstract-full" style="display: none;"> Modern instruction-tuned large language models (LLMs) have made remarkable progress in code generation. However, these LLMs fine-tuned with standard supervised fine-tuning (SFT) sometimes generate plausible-looking but functionally incorrect code variants. This issue likely stems from the limitation of standard SFT, which treats all tokens equally during optimization and fails to emphasize the error-sensitive segments: specific code differences between correct implementations and similar incorrect variants. 
To address this problem, we propose Fault-Aware Fine-Tuning (FAIT), a novel fine-tuning technique that enhances LLMs' code generation by (1) extracting multi-granularity (line/token-level) differences between correct and incorrect yet similar implementations to identify error-sensitive segments, and (2) dynamically prioritizing those segments during training via dynamic loss weighting. Through extensive experiments on seven LLMs across three widely-used benchmarks, our method achieves an average relative improvement of 6.9% on pass@1 with just one epoch of training, with some enhanced 6.7B LLMs outperforming closed-source models, e.g., GPT-3.5-Turbo. Furthermore, our fine-tuning technique demonstrates strong generalization with performance improvements ranging from 3.8% to 19.1% across diverse instruction-tuned LLMs, and our ablation studies confirm the contributions of different granularities of differences and loss function components. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.16913v1-abstract-full').style.display = 'none'; document.getElementById('2503.16913v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.16806">arXiv:2503.16806</a> <span> [<a href="https://arxiv.org/pdf/2503.16806">pdf</a>, <a href="https://arxiv.org/format/2503.16806">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> DyWA: Dynamics-adaptive World Action Model for Generalizable Non-prehensile Manipulation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lyu%2C+J">Jiangran Lyu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Z">Ziming Li</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+X">Xuesong Shi</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+C">Chaoyi Xu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yizhou Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+H">He Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.16806v1-abstract-short" style="display: inline;"> Nonprehensile manipulation is crucial for handling objects that are too thin, large, or otherwise ungraspable in unstructured environments. While conventional planning-based approaches struggle with complex contact modeling, learning-based methods have recently emerged as a promising alternative. 
However, existing learning-based approaches face two major limitations: they heavily rely on multi-vie… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.16806v1-abstract-full').style.display = 'inline'; document.getElementById('2503.16806v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.16806v1-abstract-full" style="display: none;"> Nonprehensile manipulation is crucial for handling objects that are too thin, large, or otherwise ungraspable in unstructured environments. While conventional planning-based approaches struggle with complex contact modeling, learning-based methods have recently emerged as a promising alternative. However, existing learning-based approaches face two major limitations: they heavily rely on multi-view cameras and precise pose tracking, and they fail to generalize across varying physical conditions, such as changes in object mass and table friction. To address these challenges, we propose the Dynamics-Adaptive World Action Model (DyWA), a novel framework that enhances action learning by jointly predicting future states while adapting to dynamics variations based on historical trajectories. By unifying the modeling of geometry, state, physics, and robot actions, DyWA enables more robust policy learning under partial observability. Compared to baselines, our method improves the success rate by 31.5% using only single-view point cloud observations in the simulation. Furthermore, DyWA achieves an average success rate of 68% in real-world experiments, demonstrating its ability to generalize across diverse object geometries, adapt to varying table friction, and remain robust in challenging scenarios such as half-filled water bottles and slippery surfaces. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.16806v1-abstract-full').style.display = 'none'; document.getElementById('2503.16806v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project Page:https://pku-epic.github.io/DyWA/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.16740">arXiv:2503.16740</a> <span> [<a href="https://arxiv.org/pdf/2503.16740">pdf</a>, <a href="https://arxiv.org/format/2503.16740">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Software Engineering">cs.SE</span> </div> </div> <p class="title is-5 mathjax"> Automated Harmfulness Testing for Code Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Tan%2C+H">Honghao Tan</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+H">Haibo Wang</a>, <a href="/search/cs?searchtype=author&query=Pressato%2C+D">Diany Pressato</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+Y">Yisen Xu</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+S+H">Shin Hwei Tan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.16740v1-abstract-short" style="display: inline;"> Generative AI systems powered by Large Language Models (LLMs) usually use content moderation to prevent harmful content spread. To evaluate the robustness of content moderation, several metamorphic testing techniques have been proposed to test content moderation software. However, these techniques mainly focus on general users (e.g., text and image generation). Meanwhile, a recent study shows that… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.16740v1-abstract-full').style.display = 'inline'; document.getElementById('2503.16740v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.16740v1-abstract-full" style="display: none;"> Generative AI systems powered by Large Language Models (LLMs) usually use content moderation to prevent harmful content spread. To evaluate the robustness of content moderation, several metamorphic testing techniques have been proposed to test content moderation software. However, these techniques mainly focus on general users (e.g., text and image generation). Meanwhile, a recent study shows that developers consider using harmful keywords when naming software artifacts to be an unethical behavior. Exposure to harmful content in software artifacts can negatively impact the mental health of developers, making content moderation for Code Large Language Models (Code LLMs) essential. We conduct a preliminary study on program transformations that can be misused to introduce harmful content into auto-generated code, identifying 32 such transformations. To address this, we propose CHT, a coverage-guided harmfulness testing framework that generates prompts using diverse transformations and harmful keywords injected into benign programs. CHT evaluates output damage to assess potential risks in LLM-generated explanations and code. Our evaluation of four Code LLMs and GPT-4o-mini reveals that content moderation in LLM-based code generation is easily bypassed. 
To enhance moderation, we propose a two-phase approach that first detects harmful content before generating output, improving moderation effectiveness by 483.76%. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.16740v1-abstract-full').style.display = 'none'; document.getElementById('2503.16740v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.16653">arXiv:2503.16653</a> <span> [<a href="https://arxiv.org/pdf/2503.16653">pdf</a>, <a href="https://arxiv.org/format/2503.16653">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> iFlame: Interleaving Full and Linear Attention for Efficient Mesh Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+H">Hanxiao Wang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+B">Biao Zhang</a>, <a href="/search/cs?searchtype=author&query=Quan%2C+W">Weize Quan</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+D">Dong-Ming Yan</a>, <a href="/search/cs?searchtype=author&query=Wonka%2C+P">Peter Wonka</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.16653v2-abstract-short" style="display: inline;"> This paper proposes iFlame, a novel transformer-based network architecture for mesh generation. While attention-based models have demonstrated remarkable performance in mesh generation, their quadratic computational complexity limits scalability, particularly for high-resolution 3D data. Conversely, linear attention mechanisms offer lower computational costs but often struggle to capture long-range… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.16653v2-abstract-full').style.display = 'inline'; document.getElementById('2503.16653v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.16653v2-abstract-full" style="display: none;"> This paper proposes iFlame, a novel transformer-based network architecture for mesh generation. While attention-based models have demonstrated remarkable performance in mesh generation, their quadratic computational complexity limits scalability, particularly for high-resolution 3D data. Conversely, linear attention mechanisms offer lower computational costs but often struggle to capture long-range dependencies, resulting in suboptimal outcomes. To address this trade-off, we propose an interleaving autoregressive mesh generation framework that combines the efficiency of linear attention with the expressive power of full attention mechanisms. To further enhance efficiency and leverage the inherent structure of mesh representations, we integrate this interleaving approach into an hourglass architecture, which significantly boosts efficiency. 
Our approach reduces training time while achieving performance comparable to pure attention-based models. To improve inference efficiency, we implemented a caching algorithm that almost doubles the speed and reduces the KV cache size by seven-eighths compared to the original Transformer. We evaluate our framework on ShapeNet and Objaverse, demonstrating its ability to generate high-quality 3D meshes efficiently. Our results indicate that the proposed interleaving framework effectively balances computational efficiency and generative performance, making it a practical solution for mesh generation. The training takes only 2 days with 4 GPUs on 39k meshes with a maximum of 4k faces from Objaverse. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.16653v2-abstract-full').style.display = 'none'; document.getElementById('2503.16653v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 20 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project website: https://wanghanxiao123.github.io/iFa/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.16635">arXiv:2503.16635</a> <span> [<a href="https://arxiv.org/pdf/2503.16635">pdf</a>, <a href="https://arxiv.org/format/2503.16635">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Fed-NDIF: A Noise-Embedded Federated Diffusion Model For Low-Count Whole-Body PET Denoising </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhou%2C+Y">Yinchi Zhou</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+H">Huidong Xie</a>, <a href="/search/cs?searchtype=author&query=Xia%2C+M">Menghua Xia</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Q">Qiong Liu</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+B">Bo Zhou</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+T">Tianqi Chen</a>, <a href="/search/cs?searchtype=author&query=Hou%2C+J">Jun Hou</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+L">Liang Guo</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+X">Xinyuan Zheng</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+H">Hanzhong Wang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+B">Biao Li</a>, <a href="/search/cs?searchtype=author&query=Rominger%2C+A">Axel Rominger</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+K">Kuangyu Shi</a>, <a href="/search/cs?searchtype=author&query=Dvorneka%2C+N+C">Nicha C. 
Dvorneka</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+C">Chi Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.16635v1-abstract-short" style="display: inline;"> Low-count positron emission tomography (LCPET) imaging can reduce patients' exposure to radiation but often suffers from increased image noise and reduced lesion detectability, necessitating effective denoising techniques. Diffusion models have shown promise in LCPET denoising for recovering degraded image quality. However, training such models requires large and diverse datasets, which are challe… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.16635v1-abstract-full').style.display = 'inline'; document.getElementById('2503.16635v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.16635v1-abstract-full" style="display: none;"> Low-count positron emission tomography (LCPET) imaging can reduce patients' exposure to radiation but often suffers from increased image noise and reduced lesion detectability, necessitating effective denoising techniques. Diffusion models have shown promise in LCPET denoising for recovering degraded image quality. However, training such models requires large and diverse datasets, which are challenging to obtain in the medical domain. To address data scarcity and privacy concerns, we combine diffusion models with federated learning -- a decentralized training approach where models are trained individually at different sites, and their parameters are aggregated on a central server over multiple iterations. The variation in scanner types and image noise levels within and across institutions poses additional challenges for federated learning in LCPET denoising. In this study, we propose a novel noise-embedded federated learning diffusion model (Fed-NDIF) to address these challenges, leveraging a multicenter dataset and varying count levels. Our approach incorporates liver normalized standard deviation (NSTD) noise embedding into a 2.5D diffusion model and utilizes the Federated Averaging (FedAvg) algorithm to aggregate locally trained models into a global model, which is subsequently fine-tuned on local datasets to optimize performance and obtain personalized models. Extensive validation on datasets from the University of Bern, Ruijin Hospital in Shanghai, and Yale-New Haven Hospital demonstrates the superior performance of our method in enhancing image quality and improving lesion quantification. The Fed-NDIF model shows significant improvements in PSNR, SSIM, and NMSE of the entire 3D volume, as well as enhanced lesion detectability and quantification, compared to local diffusion models and federated UNet-based models. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.16635v1-abstract-full').style.display = 'none'; document.getElementById('2503.16635v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. 
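<p>(Editorial illustration.) The aggregation step named in the abstract above, Federated Averaging (FedAvg), is a standard algorithm: each site trains locally and the server forms a sample-size-weighted average of the parameters. Below is a minimal generic sketch, not the paper's code; the noise embedding and local fine-tuning stages are omitted.</p> <pre><code># Generic FedAvg aggregation sketch: weight each site's locally trained
# parameters by its share of the total training samples, then sum.
import torch

def fedavg(state_dicts, num_samples):
    total = float(sum(num_samples))
    return {
        key: sum(sd[key].float() * (n / total)
                 for sd, n in zip(state_dicts, num_samples))
        for key in state_dicts[0]
    }

# Toy check with three sites of different dataset sizes:
sites = [{"w": torch.ones(2) * v} for v in (1.0, 2.0, 3.0)]
print(fedavg(sites, [10, 30, 60]))  # w -> 0.1*1 + 0.3*2 + 0.6*3 = 2.5
</code></pre>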
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.16578">arXiv:2503.16578</a> <span> [<a href="https://arxiv.org/pdf/2503.16578">pdf</a>, <a href="https://arxiv.org/format/2503.16578">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> SeniorTalk: A Chinese Conversation Dataset with Rich Annotations for Super-Aged Seniors </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yang Chen</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+H">Hui Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+S">Shiyao Wang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+J">Junyang Chen</a>, <a href="/search/cs?searchtype=author&query=He%2C+J">Jiabei He</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+J">Jiaming Zhou</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+X">Xi Yang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yequan Wang</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+Y">Yonghua Lin</a>, <a href="/search/cs?searchtype=author&query=Qin%2C+Y">Yong Qin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.16578v1-abstract-short" style="display: inline;"> While voice technologies increasingly serve aging populations, current systems exhibit significant performance gaps due to inadequate training data capturing elderly-specific vocal characteristics like presbyphonia and dialectal variations. The limited data available on super-aged individuals in existing elderly speech datasets, coupled with overly simple recording styles and annotation dimensions… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.16578v1-abstract-full').style.display = 'inline'; document.getElementById('2503.16578v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.16578v1-abstract-full" style="display: none;"> While voice technologies increasingly serve aging populations, current systems exhibit significant performance gaps due to inadequate training data capturing elderly-specific vocal characteristics like presbyphonia and dialectal variations. The limited data available on super-aged individuals in existing elderly speech datasets, coupled with overly simple recording styles and annotation dimensions, exacerbates this issue. To address the critical scarcity of speech data from individuals aged 75 and above, we introduce SeniorTalk, a carefully annotated Chinese spoken dialogue dataset. This dataset contains 55.53 hours of speech from 101 natural conversations involving 202 participants, ensuring a strategic balance across gender, region, and age. Through detailed annotation across multiple dimensions, it can support a wide range of speech tasks. 
We perform extensive experiments on speaker verification, speaker diarization, speech recognition, and speech editing tasks, offering crucial insights for the development of speech technologies targeting this age group. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.16578v1-abstract-full').style.display = 'none'; document.getElementById('2503.16578v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.16454">arXiv:2503.16454</a> <span> [<a href="https://arxiv.org/pdf/2503.16454">pdf</a>, <a href="https://arxiv.org/format/2503.16454">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> An Audio-Visual Fusion Emotion Generation Model Based on Neuroanatomical Alignment </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+H">Haidong Wang</a>, <a href="/search/cs?searchtype=author&query=Shan%2C+Q">Qia Shan</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">JianHua Zhang</a>, <a href="/search/cs?searchtype=author&query=Xiao%2C+P">PengFei Xiao</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+A">Ao Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.16454v1-abstract-short" style="display: inline;"> In the field of affective computing, traditional methods for generating emotions predominantly rely on deep learning techniques and large-scale emotion datasets. However, deep learning techniques are often complex and difficult to interpret, and standardized large-scale emotional datasets are difficult and costly to establish. To tackle these challenges, we introduce a novel framework named Audio… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.16454v1-abstract-full').style.display = 'inline'; document.getElementById('2503.16454v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.16454v1-abstract-full" style="display: none;"> In the field of affective computing, traditional methods for generating emotions predominantly rely on deep learning techniques and large-scale emotion datasets. However, deep learning techniques are often complex and difficult to interpret, and standardized large-scale emotional datasets are difficult and costly to establish. To tackle these challenges, we introduce a novel framework named Audio-Visual Fusion for Brain-like Emotion Learning (AVF-BEL). In contrast to conventional brain-inspired emotion learning methods, this approach improves the audio-visual emotion fusion and generation model through the integration of modular components, thereby enabling more lightweight and interpretable emotion learning and generation processes. 
The framework simulates the integration of the visual, auditory, and emotional pathways of the brain, optimizes the fusion of emotional features across visual and auditory modalities, and improves upon the traditional Brain Emotional Learning (BEL) model. The experimental results indicate a significant improvement in the similarity of the audio-visual fusion emotion learning generation model compared to single-modality visual and auditory emotion learning and generation models. Ultimately, this aligns with the fundamental phenomenon of heightened emotion generation facilitated by the integrated impact of visual and auditory stimuli. This contribution not only enhances the interpretability and efficiency of affective intelligence but also provides new insights and pathways for advancing affective computing technology. Our source code can be accessed here: https://github.com/OpenHUTB/emotion. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.16454v1-abstract-full').style.display = 'none'; document.getElementById('2503.16454v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.16402">arXiv:2503.16402</a> <span> [<a href="https://arxiv.org/pdf/2503.16402">pdf</a>, <a href="https://arxiv.org/format/2503.16402">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> The Emperor's New Clothes in Benchmarking? A Rigorous Examination of Mitigation Strategies for LLM Benchmark Data Contamination </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Sun%2C+Y">Yifan Sun</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+H">Han Wang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+D">Dongbai Li</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+G">Gang Wang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+H">Huan Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.16402v1-abstract-short" style="display: inline;"> Benchmark Data Contamination (BDC), the inclusion of benchmark testing samples in the training set, has raised increasing concerns in Large Language Model (LLM) evaluation, leading to falsely inflated performance estimates and undermining evaluation reliability. 
To address this, researchers have proposed various mitigation strategies to update existing benchmarks, including modifying original questi… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.16402v1-abstract-full').style.display = 'inline'; document.getElementById('2503.16402v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.16402v1-abstract-full" style="display: none;"> Benchmark Data Contamination (BDC), the inclusion of benchmark testing samples in the training set, has raised increasing concerns in Large Language Model (LLM) evaluation, leading to falsely inflated performance estimates and undermining evaluation reliability. To address this, researchers have proposed various mitigation strategies to update existing benchmarks, including modifying original questions or generating new ones based on them. However, a rigorous examination of the effectiveness of these mitigation strategies remains lacking. In this paper, we design a systematic and controlled pipeline along with two novel metrics, fidelity and contamination resistance, to provide a fine-grained and comprehensive assessment of existing BDC mitigation strategies. Previous assessment methods, such as accuracy drop and accuracy matching, focus solely on aggregate accuracy, often leading to incomplete or misleading conclusions. Our metrics address this limitation by emphasizing question-level evaluation result matching. Extensive experiments with 10 LLMs, 5 benchmarks, 20 BDC mitigation strategies, and 2 contamination scenarios reveal that no existing strategy significantly improves resistance over the vanilla case (i.e., no benchmark update) across all benchmarks, and none effectively balances fidelity and contamination resistance. These findings underscore the urgent need for designing more effective BDC mitigation strategies. Our code repository is available at https://github.com/ASTRAL-Group/BDC_mitigation_assessment. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.16402v1-abstract-full').style.display = 'none'; document.getElementById('2503.16402v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. 
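<p>(Editorial illustration.) One way to read the abstract's "question-level evaluation result matching" is as agreement between per-question correctness vectors rather than between aggregate accuracies. The sketch below is our hedged reading, not the authors' implementation; it shows why aggregate accuracy alone can be misleading.</p> <pre><code># Sketch (our reading, not the authors' code): compare per-question
# correctness vectors; aggregate accuracy alone can hide behavior shifts.
def agreement_rate(results_a, results_b):
    """results_*: aligned lists of per-question booleans (correct/incorrect)."""
    assert len(results_a) == len(results_b)
    return sum(a == b for a, b in zip(results_a, results_b)) / len(results_a)

clean        = [True, True, False, False]   # 50% accuracy
contaminated = [False, False, True, True]   # also 50% accuracy
print(agreement_rate(clean, contaminated))  # 0.0 -> behavior shifted entirely
</code></pre>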
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">23 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.16257">arXiv:2503.16257</a> <span> [<a href="https://arxiv.org/pdf/2503.16257">pdf</a>, <a href="https://arxiv.org/format/2503.16257">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Plug-and-Play 1.x-Bit KV Cache Quantization for Video Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Tao%2C+K">Keda Tao</a>, <a href="/search/cs?searchtype=author&query=You%2C+H">Haoxuan You</a>, <a href="/search/cs?searchtype=author&query=Sui%2C+Y">Yang Sui</a>, <a href="/search/cs?searchtype=author&query=Qin%2C+C">Can Qin</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+H">Huan Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.16257v1-abstract-short" style="display: inline;"> Video large language models (VideoLLMs) have demonstrated the capability to process longer video inputs and enable complex reasoning and analysis. However, due to the thousands of visual tokens from the video frames, key-value (KV) cache can significantly increase memory requirements, becoming a bottleneck for inference speed and memory usage. KV cache quantization is a widely used approach to add… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.16257v1-abstract-full').style.display = 'inline'; document.getElementById('2503.16257v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.16257v1-abstract-full" style="display: none;"> Video large language models (VideoLLMs) have demonstrated the capability to process longer video inputs and enable complex reasoning and analysis. However, due to the thousands of visual tokens from the video frames, key-value (KV) cache can significantly increase memory requirements, becoming a bottleneck for inference speed and memory usage. KV cache quantization is a widely used approach to address this problem. In this paper, we find that 2-bit KV quantization of VideoLLMs can hardly hurt the model performance, while the limit of KV cache quantization in even lower bits has not been investigated. To bridge this gap, we introduce VidKV, a plug-and-play KV cache quantization method to compress the KV cache to lower than 2 bits. Specifically, (1) for key, we propose a mixed-precision quantization strategy in the channel dimension, where we perform 2-bit quantization for anomalous channels and 1-bit quantization combined with FFT for normal channels; (2) for value, we implement 1.58-bit quantization while selectively filtering semantically salient visual tokens for targeted preservation, for a better trade-off between precision and model performance. Importantly, our findings suggest that the value cache of VideoLLMs should be quantized in a per-channel fashion instead of the per-token fashion proposed by prior KV cache quantization works for LLMs. 
Empirically, extensive results with LLaVA-OV-7B and Qwen2.5-VL-7B on six benchmarks show that VidKV effectively compresses the KV cache to 1.5-bit and 1.58-bit precision with almost no performance drop compared to the FP16 counterparts. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.16257v1-abstract-full').style.display = 'none'; document.getElementById('2503.16257v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">12 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.16112">arXiv:2503.16112</a> <span> [<a href="https://arxiv.org/pdf/2503.16112">pdf</a>, <a href="https://arxiv.org/format/2503.16112">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Networking and Internet Architecture">cs.NI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> </div> </div> <p class="title is-5 mathjax"> PromptMobile: Efficient Promptus for Low Bandwidth Mobile Video Streaming </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+L">Liming Liu</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+J">Jiangkai Wu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+H">Haoyang Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+P">Peiheng Wang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xinggong Zhang</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+Z">Zongming Guo</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.16112v1-abstract-short" style="display: inline;"> Traditional video compression algorithms exhibit significant quality degradation at extremely low bitrates. Promptus emerges as a new paradigm for video streaming, substantially cutting down the bandwidth essential for video streaming. However, Promptus is computationally intensive and can not run in real-time on mobile devices. This paper presents PromptMobile, an efficient acceleration framework… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.16112v1-abstract-full').style.display = 'inline'; document.getElementById('2503.16112v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.16112v1-abstract-full" style="display: none;"> Traditional video compression algorithms exhibit significant quality degradation at extremely low bitrates. Promptus emerges as a new paradigm for video streaming, substantially cutting down the bandwidth essential for video streaming. However, Promptus is computationally intensive and can not run in real-time on mobile devices. This paper presents PromptMobile, an efficient acceleration framework tailored for on-device Promptus. 
Specifically, we propose (1) a two-stage efficient generation framework to reduce computational cost by 8.1x, (2) fine-grained inter-frame caching to reduce redundant computations by 16.6%, and (3) system-level optimizations to further enhance efficiency. The evaluations demonstrate that compared with the original Promptus, PromptMobile achieves a 13.6x increase in image generation speed. Compared with other streaming methods, PromptMobile achieves an average LPIPS improvement of 0.016 (compared with H.265), reducing severely distorted frames by 60% (compared to VQGAN). <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.16112v1-abstract-full').style.display = 'none'; document.getElementById('2503.16112v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">7 pages, 10 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.16055">arXiv:2503.16055</a> <span> [<a href="https://arxiv.org/pdf/2503.16055">pdf</a>, <a href="https://arxiv.org/format/2503.16055">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> SALT: Singular Value Adaptation with Low-Rank Transformation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Elsayed%2C+A">Abdelrahman Elsayed</a>, <a href="/search/cs?searchtype=author&query=Hashmi%2C+S">Sarim Hashmi</a>, <a href="/search/cs?searchtype=author&query=Elseiagy%2C+M">Mohammed Elseiagy</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+H">Hu Wang</a>, <a href="/search/cs?searchtype=author&query=Yaqub%2C+M">Mohammad Yaqub</a>, <a href="/search/cs?searchtype=author&query=Almakky%2C+I">Ibrahim Almakky</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.16055v1-abstract-short" style="display: inline;"> The complex nature of medical image segmentation calls for models that are specifically designed to capture detailed, domain-specific features. Large foundation models offer considerable flexibility, yet the cost of fine-tuning these models remains a significant barrier. Parameter-Efficient Fine-Tuning (PEFT) methods, such as Low-Rank Adaptation (LoRA), efficiently update model weights with low-ra… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.16055v1-abstract-full').style.display = 'inline'; document.getElementById('2503.16055v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.16055v1-abstract-full" style="display: none;"> The complex nature of medical image segmentation calls for models that are specifically designed to capture detailed, domain-specific features. 
arXiv:2503.16055 [pdf, other] eess.IV cs.CV
SALT: Singular Value Adaptation with Low-Rank Transformation
Authors: Abdelrahman Elsayed, Sarim Hashmi, Mohammed Elseiagy, Hu Wang, Mohammad Yaqub, Ibrahim Almakky
Abstract: The complex nature of medical image segmentation calls for models that are specifically designed to capture detailed, domain-specific features. Large foundation models offer considerable flexibility, yet the cost of fine-tuning them remains a significant barrier. Parameter-Efficient Fine-Tuning (PEFT) methods, such as Low-Rank Adaptation (LoRA), efficiently update model weights with low-rank matrices but may suffer from underfitting when the chosen rank is insufficient to capture domain-specific nuances. Conversely, full-rank methods based on Singular Value Decomposition (SVD) provide comprehensive updates by modifying all singular values, yet they often lack flexibility and exhibit variable performance across datasets. We propose SALT (Singular Value Adaptation with Low-Rank Transformation), a method that selectively adapts the most influential singular values using trainable scale and shift parameters while complementing this with a low-rank update for the remaining subspace. This hybrid approach harnesses the advantages of both LoRA and SVD, enabling effective adaptation without increasing model size or depth. Evaluated on 5 challenging medical datasets ranging from as few as 20 samples to 1000, SALT outperforms state-of-the-art PEFT methods (LoRA and SVD) by 2% to 5% in Dice score with only 3.9% trainable parameters, demonstrating robust adaptation even in low-resource settings. Code: https://github.com/BioMedIA-MBZUAI/SALT
Submitted 20 March, 2025; originally announced March 2025.
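The description above (scale/shift on the top singular values plus a low-rank residual) maps to a short computation. Below is a minimal PyTorch sketch under that reading; shapes, initialization, and the per-call SVD (in practice the SVD of the frozen weight would be precomputed once) are illustrative, not the paper's exact implementation.

```python
# Sketch of the SALT idea: adapt the top-r singular values of a frozen
# weight with trainable scale/shift, plus a LoRA-style residual A @ B
# for the remaining subspace.
import torch

def salt_linear(W, x, scale, shift, A, B, r):
    """W: (out, in) frozen; x: (batch, in); scale, shift: (r,);
    A: (out, k), B: (k, in) trainable low-rank factors."""
    U, S, Vh = torch.linalg.svd(W, full_matrices=False)
    S = S.clone()
    S[:r] = S[:r] * scale + shift               # adapt dominant singular values
    W_adapted = U @ torch.diag(S) @ Vh + A @ B  # SVD part + low-rank residual
    return x @ W_adapted.T

W = torch.randn(64, 32)                          # frozen pretrained weight
x = torch.randn(4, 32)
r, k = 8, 4
scale = torch.ones(r, requires_grad=True)
shift = torch.zeros(r, requires_grad=True)
A = (0.01 * torch.randn(64, k)).requires_grad_()
B = (0.01 * torch.randn(k, 32)).requires_grad_()
print(salt_linear(W, x, scale, shift, A, B, r).shape)  # torch.Size([4, 64])
```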
arXiv:2503.15887 [pdf, other] cs.CV
DocVideoQA: Towards Comprehensive Understanding of Document-Centric Videos through Question Answering
Authors: Haochen Wang, Kai Hu, Liangcai Gao
Abstract: Remote work and online courses have become important channels of knowledge dissemination, producing a large number of document-based instructional videos. Unlike traditional video datasets, these videos mainly feature rich-text images and audio densely packed with information closely tied to the visual content, requiring advanced multimodal understanding capabilities. However, this domain remains underexplored due to limited dataset availability and its inherent complexity. In this paper, we introduce the DocVideoQA task and dataset for the first time, comprising 1,454 videos across 23 categories with a total duration of about 828 hours. The dataset is annotated with 154k question-answer pairs generated manually and via GPT, assessing models' comprehension, temporal awareness, and modality-integration capabilities. We first establish a baseline using open-source MLLMs. Recognizing the challenges in modality comprehension for document-centric videos, we then present DV-LLaMA, a robust video MLLM baseline. Our method enhances unimodal feature extraction with diverse instruction-tuning data and employs contrastive learning to strengthen modality integration. Through fine-tuning, the LLM is equipped with audio-visual capabilities, leading to significant improvements in document-centric video understanding. Extensive testing on the DocVideoQA dataset shows that DV-LLaMA significantly outperforms existing models. We will release the code and dataset to facilitate future research.
Submitted 20 March, 2025; originally announced March 2025.
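"Contrastive learning to strengthen modality integration" typically takes the shape of a CLIP-style symmetric InfoNCE objective between modality embeddings; the sketch below shows that generic form as an assumption, not DV-LLaMA's exact loss.

```python
# Generic symmetric InfoNCE loss for aligning two modality embeddings.
# Matched audio/visual pairs sit on the diagonal of the similarity
# matrix; everything else in the batch acts as negatives.
import torch
import torch.nn.functional as F

def contrastive_alignment_loss(audio_emb, visual_emb, temperature=0.07):
    a = F.normalize(audio_emb, dim=-1)
    v = F.normalize(visual_emb, dim=-1)
    logits = a @ v.T / temperature         # (batch, batch) similarities
    targets = torch.arange(a.size(0))      # matched pairs on the diagonal
    return 0.5 * (F.cross_entropy(logits, targets) +
                  F.cross_entropy(logits.T, targets))

loss = contrastive_alignment_loss(torch.randn(8, 256), torch.randn(8, 256))
print(loss)
```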
arXiv:2503.15875 [pdf, other] cs.CV
MiLA: Multi-view Intensive-fidelity Long-term Video Generation World Model for Autonomous Driving
Authors: Haiguang Wang, Daqi Liu, Hongwei Xie, Haisong Liu, Enhui Ma, Kaicheng Yu, Limin Wang, Bing Wang
Abstract: In recent years, data-driven techniques have greatly advanced autonomous driving systems, but the need for rare and diverse training data remains a challenge, requiring significant investment in equipment and labor. World models, which predict and generate future environmental states, offer a promising solution by synthesizing annotated video data for training. However, existing methods struggle to generate long, consistent videos without accumulating errors, especially in dynamic scenes. To address this, we propose MiLA, a novel framework for generating high-fidelity videos of up to one minute in duration. MiLA utilizes a Coarse-to-Re(fine) approach to both stabilize video generation and correct distortion of dynamic objects. Additionally, we introduce a Temporal Progressive Denoising Scheduler and Joint Denoising and Correcting Flow modules to improve the quality of generated videos. Extensive experiments on the nuScenes dataset show that MiLA achieves state-of-the-art video generation quality. For more information, visit the project website: https://github.com/xiaomi-mlab/mila.github.io
Submitted 20 March, 2025; originally announced March 2025.
Comments: project website: https://github.com/xiaomi-mlab/mila.github.io

arXiv:2503.15579 [pdf, other] cs.LG
Understanding the Generalization of In-Context Learning in Transformers: An Empirical Study
Authors: Xingxuan Zhang, Haoran Wang, Jiansheng Li, Yuan Xue, Shikai Guan, Renzhe Xu, Hao Zou, Han Yu, Peng Cui
Abstract: Large language models (LLMs) like GPT-4 and LLaMA-3 utilize the powerful in-context learning (ICL) capability of the Transformer architecture to learn on the fly from limited examples. While ICL underpins many LLM applications, its full potential remains hindered by a limited understanding of its generalization boundaries and vulnerabilities. We present a systematic investigation of transformers' generalization capability with ICL relative to training-data coverage by defining a task-centric framework along three dimensions: inter-problem, intra-problem, and intra-task generalization.
Through extensive simulation and real-world experiments encompassing tasks such as function fitting, API calling, and translation, we find that transformers lack inter-problem generalization with ICL but excel in intra-task and intra-problem generalization. When the training data includes a greater variety of mixed tasks, the generalization ability of ICL on unseen tasks, and even on known simple tasks, improves significantly. This suggests designing training data to maximize the diversity of covered tasks and to combine different tasks whenever possible, rather than focusing solely on the target task.
Submitted 19 March, 2025; originally announced March 2025.
Comments: 32 pages

arXiv:2503.15558 [pdf, other] cs.AI cs.CV cs.LG cs.RO
Cosmos-Reason1: From Physical Common Sense To Embodied Reasoning
Authors: NVIDIA: Alisson Azzolini, Hannah Brandon, Prithvijit Chattopadhyay, Huayu Chen, Jinju Chu, Yin Cui, Jenna Diamond, Yifan Ding, Francesco Ferroni, Rama Govindaraju, Jinwei Gu, Siddharth Gururani, Imad El Hanafi, Zekun Hao, Jacob Huffman, Jingyi Jin,
href="/search/cs?searchtype=author&query=Johnson%2C+B">Brendan Johnson</a>, <a href="/search/cs?searchtype=author&query=Khan%2C+R">Rizwan Khan</a>, <a href="/search/cs?searchtype=author&query=Kurian%2C+G">George Kurian</a>, <a href="/search/cs?searchtype=author&query=Lantz%2C+E">Elena Lantz</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+N">Nayeon Lee</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zhaoshuo Li</a>, <a href="/search/cs?searchtype=author&query=Li%2C+X">Xuan Li</a> , et al. (21 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.15558v1-abstract-short" style="display: inline;"> Physical AI systems need to perceive, understand, and perform complex actions in the physical world. In this paper, we present the Cosmos-Reason1 models that can understand the physical world and generate appropriate embodied decisions (e.g., next step action) in natural language through long chain-of-thought reasoning processes. We begin by defining key capabilities for Physical AI reasoning, wit… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.15558v1-abstract-full').style.display = 'inline'; document.getElementById('2503.15558v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.15558v1-abstract-full" style="display: none;"> Physical AI systems need to perceive, understand, and perform complex actions in the physical world. In this paper, we present the Cosmos-Reason1 models that can understand the physical world and generate appropriate embodied decisions (e.g., next step action) in natural language through long chain-of-thought reasoning processes. We begin by defining key capabilities for Physical AI reasoning, with a focus on physical common sense and embodied reasoning. To represent physical common sense, we use a hierarchical ontology that captures fundamental knowledge about space, time, and physics. For embodied reasoning, we rely on a two-dimensional ontology that generalizes across different physical embodiments. Building on these capabilities, we develop two multimodal large language models, Cosmos-Reason1-8B and Cosmos-Reason1-56B. We curate data and train our models in four stages: vision pre-training, general supervised fine-tuning (SFT), Physical AI SFT, and Physical AI reinforcement learning (RL) as the post-training. To evaluate our models, we build comprehensive benchmarks for physical common sense and embodied reasoning according to our ontologies. Evaluation results show that Physical AI SFT and reinforcement learning bring significant improvements. To facilitate the development of Physical AI, we will make our code and pre-trained models available under the NVIDIA Open Model License at https://github.com/nvidia-cosmos/cosmos-reason1. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.15558v1-abstract-full').style.display = 'none'; document.getElementById('2503.15558v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. 
arXiv:2503.15450 [pdf, other] cs.CL
SkyLadder: Better and Faster Pretraining via Context Window Scheduling
Authors: Tongyao Zhu, Qian Liu, Haonan Wang, Shiqi Chen, Xiangming Gu, Tianyu Pang, Min-Yen Kan
Abstract: Recent advancements in LLM pretraining have featured ever-expanding context windows to process longer sequences. However, our pilot study reveals that models pretrained with shorter context windows consistently outperform their long-context counterparts under a fixed token budget. This finding motivates us to explore an optimal context window scheduling strategy to better balance long-context capability with pretraining efficiency. To this end, we propose SkyLadder, a simple yet effective approach that implements a short-to-long context window transition. SkyLadder preserves strong standard-benchmark performance while matching or exceeding baseline results on long-context tasks. Through extensive experiments, we pre-train 1B-parameter models (up to 32K context) and 3B-parameter models (8K context) on 100B tokens, demonstrating that SkyLadder yields consistent gains of up to 3.7% on common benchmarks while achieving up to 22% faster training speeds compared to baselines. The code is at https://github.com/sail-sg/SkyLadder
Submitted 19 March, 2025; originally announced March 2025.
Comments: 22 pages. Accepted to ICLR 2025 Workshop on Open Science for Foundation Models
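To make "short-to-long context window transition" concrete, here is a toy schedule in which the window grows linearly from a small initial size to the full target size over a fraction of training, then stays flat. The linear ramp and its parameters are assumptions for illustration; SkyLadder's actual transition function may differ.

```python
# Toy short-to-long context window schedule for pretraining.
def context_window(step: int, total_steps: int,
                   init_len: int = 256, final_len: int = 32768,
                   ramp_frac: float = 0.8) -> int:
    """Linearly grow the window over the first ramp_frac of training."""
    ramp_steps = int(total_steps * ramp_frac)
    if step >= ramp_steps:
        return final_len
    frac = step / max(ramp_steps, 1)
    return int(init_len + frac * (final_len - init_len))

# Window sizes at a few points in a hypothetical 100k-step run.
for s in (0, 25_000, 50_000, 80_000, 99_999):
    print(s, context_window(s, 100_000))
```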
arXiv:2503.15277 [pdf, other] cs.SE doi: 10.1145/3664811
What Makes a Good TODO Comment?
Authors: Haoye Wang, Zhipeng Gao, Tingting Bi, John Grundy, Xinyu Wang, Minghui Wu, Xiaohu Yang
Abstract: Software development is a collaborative process that involves various interactions among individuals and teams. TODO comments in source code play a critical role in managing and coordinating diverse tasks during this process. However, this study finds that a large proportion of open-source project TODO comments are left unresolved or take a long time to be resolved.
About 46.7% of TODO comments in open-source repositories are of low quality (e.g., TODOs that are ambiguous, lack information, or are useless to developers). This highlights the need for better TODO practices. In this study, we investigate four aspects of TODO comment quality in open-source projects: (1) the prevalence of low-quality TODO comments; (2) the key characteristics of high-quality TODO comments; (3) how TODO comments of different quality are managed in practice; and (4) the feasibility of automatically assessing TODO comment quality. Examining 2,863 TODO comments from the top 100 GitHub Java repositories, we propose criteria to identify high-quality TODO comments and provide insights into their optimal composition. We discuss the lifecycle of TODO comments of varying quality, and we construct deep-learning-based methods that show promising performance in identifying TODO comment quality, potentially enhancing development efficiency and code quality.
Submitted 19 March, 2025; originally announced March 2025.
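To make the quality contrast concrete, here are two hypothetical TODOs in the spirit of the criteria described above; the author name and issue number are invented for illustration.

```python
# Hypothetical examples of TODO comment quality.
# A vague TODO gives future readers nothing to act on:
def parse_config(path):
    # TODO: fix this
    ...

# A higher-quality TODO states the task, the rationale, and a pointer
# for follow-up, the kind of composition the study's criteria target:
def load_user(user_id):
    # TODO(alice): cache results per user_id to avoid the repeated DB
    # round-trip observed under load; see issue #1234 for profiling data.
    ...
```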
arXiv:2503.15091 [pdf, other] cs.RO cs.CV
Intelligent Spatial Perception by Building Hierarchical 3D Scene Graphs for Indoor Scenarios with the Help of LLMs
Authors: Yao Cheng, Zhe Han, Fengyang Jiang, Huaizhen Wang, Fengyu Zhou, Qingshan Yin, Lei Wei
Abstract: This paper addresses the high demand in advanced intelligent robot navigation for a more holistic understanding of spatial environments by introducing a novel system that harnesses the capabilities of Large Language Models (LLMs) to construct hierarchical 3D Scene Graphs (3DSGs) for indoor scenarios. The proposed framework constructs 3DSGs consisting of a fundamental layer with rich metric-semantic information, an object layer featuring precise point-cloud representations of object nodes as well as visual descriptors, and higher layers of room, floor, and building nodes. Thanks to the innovative application of LLMs, not only object nodes but also higher-layer nodes, e.g., room nodes, are annotated in an intelligent and accurate manner. A polling mechanism for room classification using LLMs is proposed to enhance the accuracy and reliability of room-node annotation. Thorough numerical experiments demonstrate the system's ability to integrate semantic descriptions with geometric data, creating an accurate and comprehensive representation of the environment that is instrumental for context-aware navigation and task planning.
Submitted 19 March, 2025; originally announced March 2025.
Comments: accepted by WRC SARA 2024
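A polling mechanism of this kind can be pictured as repeated LLM queries plus a majority vote. The sketch below assumes a generic `ask_llm` callable and an illustrative prompt; neither is the paper's actual interface.

```python
# Toy LLM polling for room classification: ask several times over the
# detected object labels and majority-vote the answers, using the vote
# share as a crude confidence proxy.
from collections import Counter

def classify_room(object_labels, ask_llm, n_votes=5):
    prompt = (
        "A room contains: " + ", ".join(object_labels) +
        ". Answer with one word: kitchen, bedroom, bathroom, office, or living room."
    )
    votes = [ask_llm(prompt).strip().lower() for _ in range(n_votes)]
    label, count = Counter(votes).most_common(1)[0]
    return label, count / n_votes

# Example with a stubbed model:
label, conf = classify_room(["stove", "fridge", "sink"], lambda p: "kitchen")
print(label, conf)  # kitchen 1.0
```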
arXiv:2503.14863 [pdf, other] cs.CV
Temporal-Consistent Video Restoration with Pre-trained Diffusion Models
Authors: Hengkang Wang, Yang Liu, Huidong Liu, Chien-Chih Wang, Yanhui Guo, Hongdong Li, Bryan Wang, Ju Sun
Abstract: Video restoration (VR) aims to recover high-quality videos from degraded ones. Although recent zero-shot VR methods using pre-trained diffusion models (DMs) show good promise, they suffer from approximation errors during reverse diffusion and insufficient temporal consistency. Moreover, because it deals with 3D video data, VR is inherently computationally intensive. In this paper, we advocate viewing the reverse process in DMs as a function and present a novel maximum a posteriori (MAP) framework that directly parameterizes video frames in the seed space of DMs, eliminating approximation errors. We also introduce strategies to promote bilevel temporal consistency: semantic consistency by leveraging clustering structures in the seed space, and pixel-level consistency by progressive warping with optical-flow refinements. Extensive experiments on multiple video restoration tasks demonstrate superior visual quality and temporal consistency achieved by our method compared to the state-of-the-art.
Submitted 18 March, 2025; originally announced March 2025.
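The pixel-level consistency idea (warp the previous output toward the current frame with optical flow and penalize disagreement) can be sketched with a standard bilinear warp; this is a generic formulation, not the paper's exact loss.

```python
# Toy pixel-level temporal-consistency penalty using a standard
# grid_sample bilinear warp.
import torch
import torch.nn.functional as F

def warp(frame, flow):
    """frame: (1, C, H, W); flow: (1, 2, H, W) in pixels."""
    _, _, h, w = frame.shape
    ys, xs = torch.meshgrid(torch.arange(h), torch.arange(w), indexing="ij")
    grid_x = (xs + flow[:, 0]) / (w - 1) * 2 - 1   # normalize to [-1, 1]
    grid_y = (ys + flow[:, 1]) / (h - 1) * 2 - 1
    grid = torch.stack((grid_x, grid_y), dim=-1)    # (1, H, W, 2)
    return F.grid_sample(frame, grid, align_corners=True)

def temporal_consistency_loss(prev_frame, cur_frame, flow):
    return F.l1_loss(warp(prev_frame, flow), cur_frame)

prev, cur = torch.rand(1, 3, 64, 64), torch.rand(1, 3, 64, 64)
print(temporal_consistency_loss(prev, cur, torch.zeros(1, 2, 64, 64)))
```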
arXiv:2503.14559 [pdf, other] cs.LG cs.AI cs.CL cs.CV
Squeeze Out Tokens from Sample for Finer-Grained Data Governance
Authors: Weixiong Lin, Chen Ju, Haicheng Wang, Shengchao Hu, Shuai Xiao, Mengting Chen, Yuheng Jiao, Mingshuai Yao, Jinsong Lan, Qingwen Liu, Ying Chen
Abstract: Widely observed data scaling laws, in which error falls off as a power of the training-set size, demonstrate the diminishing returns of unselective data expansion. Hence, data governance has been proposed to downsize datasets by pruning non-informative samples.
Yet isolating the impact of a specific sample on overall model performance is challenging, due to the vast computation required to try out all sample combinations. Current data governors circumvent this complexity by estimating sample contributions through heuristic-derived scalar scores and discarding low-value samples. Despite thorough sample sieving, the retained samples still intrinsically contain substantial undesired tokens, underscoring the potential for further compression and purification. In this work, we upgrade data governance from a 'sieving' approach to a 'juicing' one. Instead of scanning for the least-flawed samples, our dual-branch DataJuicer applies finer-grained intra-sample governance: it squeezes out informative tokens and boosts image-text alignment. Specifically, the vision branch retains salient image patches and extracts relevant object classes, while the text branch incorporates these classes to enhance captions. Consequently, DataJuicer yields more refined datasets through finer-grained governance. Extensive experiments across datasets demonstrate that DataJuicer significantly outperforms the existing DataSieve approach in image-text retrieval, classification, and dense visual reasoning.
Submitted 18 March, 2025; originally announced March 2025.
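The "juicing" idea of retaining salient patches rather than whole samples can be caricatured in a few lines; the norm-based saliency proxy below is an assumption, not DataJuicer's actual criterion.

```python
# Toy intra-sample "juicing": keep only the top-k image patches by a
# saliency score instead of dropping whole samples.
import numpy as np

def squeeze_patches(patch_embs: np.ndarray, keep_frac: float = 0.25):
    """patch_embs: (num_patches, dim). Returns the retained subset."""
    scores = np.linalg.norm(patch_embs, axis=1)   # saliency proxy
    k = max(1, int(len(scores) * keep_frac))
    keep = np.argsort(scores)[-k:]                # indices of top-k patches
    return patch_embs[np.sort(keep)]              # preserve spatial order

patches = np.random.randn(196, 768)   # e.g., a 14x14 ViT patch grid
kept = squeeze_patches(patches)
print(kept.shape)                     # (49, 768)
```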
arXiv:2503.14535 [pdf, other] cs.CV cs.AI eess.IV
Interpretable Unsupervised Joint Denoising and Enhancement for Real-World low-light Scenarios
Authors: Huaqiu Li, Xiaowan Hu, Haoqian Wang
Abstract: Real-world low-light images often suffer from complex degradations such as local overexposure, low brightness, noise, and uneven illumination. Supervised methods tend to overfit to specific scenarios, while unsupervised methods, though better at generalization, struggle to model these degradations due to the lack of reference images. To address this issue, we propose an interpretable, zero-reference joint denoising and low-light enhancement framework tailored for real-world scenarios. Our method derives a training strategy based on paired sub-images with varying illumination and noise levels, grounded in physical imaging principles and Retinex theory. Additionally, we leverage the Discrete Cosine Transform (DCT) to perform frequency-domain decomposition in the sRGB space and introduce an implicit-guided hybrid representation strategy that effectively separates intricate compounded degradations. In the backbone network design, we develop a retinal decomposition network guided by implicit degradation representation mechanisms. Extensive experiments demonstrate the superiority of our method. Code will be available at https://github.com/huaqlili/unsupervised-light-enhance-ICLR2025
Submitted 17 March, 2025; originally announced March 2025.
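The DCT decomposition step has a simple skeleton: transform, mask low frequencies, invert both parts. The square low-pass cutoff below is an illustrative choice (using scipy.fft), not the paper's exact decomposition.

```python
# Toy DCT-based frequency decomposition of an image channel into a
# low-frequency (illumination-like) part and a high-frequency
# (detail/noise) part. The split is exact: low + high == input.
import numpy as np
from scipy.fft import dctn, idctn

def dct_decompose(channel: np.ndarray, cutoff: int = 16):
    coeffs = dctn(channel, norm="ortho")
    mask = np.zeros_like(coeffs)
    mask[:cutoff, :cutoff] = 1.0                      # keep low frequencies
    low = idctn(coeffs * mask, norm="ortho")          # illumination-like part
    high = idctn(coeffs * (1 - mask), norm="ortho")   # detail/noise part
    return low, high

img = np.random.rand(128, 128)
low, high = dct_decompose(img)
print(np.allclose(img, low + high))  # True
```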