Search | arXiv e-print repository
<a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1–50 of 6,433 results for author: <span class="mathjax">Li, X</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a> </span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&query=Li%2C+X">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." 
type="text" value="Li, X"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Li%2C+X&terms-0-field=author&size=50&order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Li, X"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. 
</div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Li%2C+X&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Li%2C+X&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Li%2C+X&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Li%2C+X&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Li%2C+X&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Li%2C+X&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">…</span></li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.10362">arXiv:2502.10362</a> <span> [<a href="https://arxiv.org/pdf/2502.10362">pdf</a>, <a href="https://arxiv.org/format/2502.10362">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> CLaMP 3: Universal Music Information Retrieval Across Unaligned Modalities and Unseen Languages </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wu%2C+S">Shangda Wu</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+Z">Zhancheng Guo</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+R">Ruibin Yuan</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+J">Junyan Jiang</a>, <a href="/search/cs?searchtype=author&query=Doh%2C+S">Seungheon Doh</a>, <a href="/search/cs?searchtype=author&query=Xia%2C+G">Gus Xia</a>, <a href="/search/cs?searchtype=author&query=Nam%2C+J">Juhan Nam</a>, <a href="/search/cs?searchtype=author&query=Li%2C+X">Xiaobing Li</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+F">Feng Yu</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+M">Maosong Sun</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.10362v1-abstract-short" style="display: inline;"> CLaMP 3 is a unified framework developed to address challenges of cross-modal and cross-lingual generalization in music information retrieval. 
Using contrastive learning, it aligns all major music modalities--including sheet music, performance signals, and audio recordings--with multilingual text in a shared representation space, enabling retrieval across unaligned modalities with text as a bridge… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.10362v1-abstract-full').style.display = 'inline'; document.getElementById('2502.10362v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.10362v1-abstract-full" style="display: none;"> CLaMP 3 is a unified framework developed to address challenges of cross-modal and cross-lingual generalization in music information retrieval. Using contrastive learning, it aligns all major music modalities--including sheet music, performance signals, and audio recordings--with multilingual text in a shared representation space, enabling retrieval across unaligned modalities with text as a bridge. It features a multilingual text encoder adaptable to unseen languages, exhibiting strong cross-lingual generalization. Leveraging retrieval-augmented generation, we curated M4-RAG, a web-scale dataset consisting of 2.31 million music-text pairs. This dataset is enriched with detailed metadata that represents a wide array of global musical traditions. To advance future research, we release WikiMT-X, a benchmark comprising 1,000 triplets of sheet music, audio, and richly varied text descriptions. Experiments show that CLaMP 3 achieves state-of-the-art performance on multiple MIR tasks, significantly surpassing previous strong baselines and demonstrating excellent generalization in multimodal and multilingual music contexts. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.10362v1-abstract-full').style.display = 'none'; document.getElementById('2502.10362v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">20 pages, 8 figures, 12 tables</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.10145">arXiv:2502.10145</a> <span> [<a href="https://arxiv.org/pdf/2502.10145">pdf</a>, <a href="https://arxiv.org/format/2502.10145">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> </div> </div> <p class="title is-5 mathjax"> Interpretable Concept-based Deep Learning Framework for Multimodal Human Behavior Modeling </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+X">Xinyu Li</a>, <a href="/search/cs?searchtype=author&query=Mahmoud%2C+M">Marwa Mahmoud</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.10145v1-abstract-short" style="display: inline;"> In the contemporary era of intelligent connectivity, Affective Computing (AC), which enables systems to recognize, interpret, and respond to human behavior states, has become an integrated part of many AI systems. As one of the most critical components of responsible AI and trustworthiness in all human-centered systems, explainability has been a major concern in AC. Particularly, the recently rele… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.10145v1-abstract-full').style.display = 'inline'; document.getElementById('2502.10145v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.10145v1-abstract-full" style="display: none;"> In the contemporary era of intelligent connectivity, Affective Computing (AC), which enables systems to recognize, interpret, and respond to human behavior states, has become an integrated part of many AI systems. As one of the most critical components of responsible AI and trustworthiness in all human-centered systems, explainability has been a major concern in AC. Particularly, the recently released EU General Data Protection Regulation requires any high-risk AI systems to be sufficiently interpretable, including biometric-based systems and emotion recognition systems widely used in the affective computing field. Existing explainable methods often compromise between interpretability and performance. Most of them focus only on highlighting key network parameters without offering meaningful, domain-specific explanations to the stakeholders. Additionally, they also face challenges in effectively co-learning and explaining insights from multimodal data sources. To address these limitations, we propose a novel and generalizable framework, namely the Attention-Guided Concept Model (AGCM), which provides learnable conceptual explanations by identifying what concepts that lead to the predictions and where they are observed. AGCM is extendable to any spatial and temporal signals through multimodal concept alignment and co-learning, empowering stakeholders with deeper insights into the model's decision-making process. 
2. arXiv:2502.10145 [pdf, other]
Categories: cs.CV (Computer Vision and Pattern Recognition); cs.MM (Multimedia)
Title: Interpretable Concept-based Deep Learning Framework for Multimodal Human Behavior Modeling
Authors: Xinyu Li, Marwa Mahmoud
Abstract: In the contemporary era of intelligent connectivity, Affective Computing (AC), which enables systems to recognize, interpret, and respond to human behavior states, has become an integrated part of many AI systems. As one of the most critical components of responsible AI and trustworthiness in all human-centered systems, explainability has been a major concern in AC. In particular, the recently released EU General Data Protection Regulation requires any high-risk AI systems to be sufficiently interpretable, including biometric-based systems and emotion recognition systems widely used in the affective computing field. Existing explainable methods often compromise between interpretability and performance. Most of them focus only on highlighting key network parameters without offering meaningful, domain-specific explanations to the stakeholders. Additionally, they also face challenges in effectively co-learning and explaining insights from multimodal data sources. To address these limitations, we propose a novel and generalizable framework, namely the Attention-Guided Concept Model (AGCM), which provides learnable conceptual explanations by identifying what concepts lead to the predictions and where they are observed. AGCM is extendable to any spatial and temporal signals through multimodal concept alignment and co-learning, empowering stakeholders with deeper insights into the model's decision-making process. We validate the effectiveness of AGCM on well-established Facial Expression Recognition benchmark datasets while also demonstrating its generalizability on more complex real-world human behavior understanding applications.
Submitted 14 February, 2025; originally announced February 2025.
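The "what concepts and where" framing in the abstract above matches the general shape of an attention-based concept bottleneck: spatial features are attended into concept activations, and the classifier sees only those activations. The numpy sketch below shows that generic pattern under invented shapes; the actual AGCM architecture is not specified in the abstract.

```python
import numpy as np

def softmax(x, axis=-1):
    e = np.exp(x - x.max(axis=axis, keepdims=True))
    return e / e.sum(axis=axis, keepdims=True)

def concept_bottleneck_forward(feats, W_concept, W_attn, W_cls):
    """Toy concept-bottleneck pass over feats (regions, dim) from any backbone.

    Returns class logits plus per-concept activations and attention maps,
    i.e. 'what' concepts fired and 'where' they were observed.
    """
    attn = softmax(feats @ W_attn, axis=0)       # (regions, concepts): where
    pooled = attn.T @ feats                      # (concepts, dim)
    concept_scores = np.einsum('cd,cd->c', pooled, W_concept)  # what
    class_logits = W_cls @ concept_scores        # predict only from concepts
    return class_logits, concept_scores, attn

rng = np.random.default_rng(1)
feats = rng.normal(size=(49, 64))                # e.g. a 7x7 grid of features
logits, concepts, attn = concept_bottleneck_forward(
    feats,
    rng.normal(size=(5, 64)),                    # 5 hypothetical concepts
    rng.normal(size=(64, 5)),
    rng.normal(size=(7, 5)))                     # 7 hypothetical classes
print(logits.shape, concepts.shape, attn.shape)
```

Because the prediction is a function of the concept scores alone, each output can be explained by which concepts activated and by the attention map showing where each concept was detected.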
3. arXiv:2502.10095 [pdf, other]
Categories: cs.LG (Machine Learning)
Title: Representation Learning on Out of Distribution in Tabular Data
Authors: Achmad Ginanjar, Xue Li, Priyanka Singh, Wen Hua
Abstract: The open-world assumption in model development suggests that a model might lack sufficient information to adequately handle data that is entirely distinct or out of distribution (OOD). While deep learning methods have shown promising results in handling OOD data through generalization techniques, they often require specialized hardware that may not be accessible to all users. We present TCL, a lightweight yet effective solution that operates efficiently on standard CPU hardware. Our approach adapts contrastive learning principles specifically for tabular data structures, incorporating full matrix augmentation and simplified loss calculation. Through comprehensive experiments across 10 diverse datasets, we demonstrate that TCL outperforms existing models, including FT-Transformer and ResNet, particularly in classification tasks, while maintaining competitive performance in regression problems. TCL achieves these results with significantly reduced computational requirements, making it accessible to users with limited hardware capabilities. This study also provides practical guidance for detecting and evaluating OOD data through straightforward experiments and visualizations. Our findings show that TCL offers a promising balance between performance and efficiency in handling OOD prediction tasks, which is particularly beneficial for general machine learning practitioners working with computational constraints.
Submitted 14 February, 2025; originally announced February 2025.
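The ingredients listed for TCL, contrastive learning adapted to tables with a "full matrix augmentation" and a simplified loss, all CPU-friendly, admit a simple reading sketched below: corrupt the whole feature matrix by resampling cells within their columns, encode two corrupted views, and apply an in-batch contrastive loss. The augmentation choice and the linear stand-in encoder are assumptions, not details from the paper.

```python
import numpy as np

rng = np.random.default_rng(2)

def corrupt(X, p=0.3):
    """One plausible whole-matrix augmentation for tabular data: replace a
    random fraction p of cells with values drawn from the same column, so
    column semantics are kept while individual rows are corrupted."""
    mask = rng.random(X.shape) < p
    col_shuffled = np.apply_along_axis(rng.permutation, 0, X)
    return np.where(mask, col_shuffled, X)

def encode(X, W):
    """Stand-in encoder: a single linear map with L2-normalized output;
    TCL's actual encoder is not specified in the abstract."""
    Z = X @ W
    return Z / np.linalg.norm(Z, axis=1, keepdims=True)

def simplified_contrastive_loss(Z1, Z2, temperature=0.5):
    """Row i of view 1 should be most similar to row i of view 2."""
    logits = Z1 @ Z2.T / temperature
    logits -= logits.max(axis=1, keepdims=True)
    logp = logits - np.log(np.exp(logits).sum(axis=1, keepdims=True))
    return -np.diag(logp).mean()

X = rng.normal(size=(16, 10))            # a small numeric table
W = rng.normal(size=(10, 32))
print(simplified_contrastive_loss(encode(corrupt(X), W), encode(corrupt(X), W)))
```

Everything here is dense linear algebra over small matrices, consistent with the abstract's emphasis on running efficiently on standard CPU hardware.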
4. arXiv:2502.10059 [pdf, other]
Categories: cs.CV (Computer Vision and Pattern Recognition)
Title: RealCam-I2V: Real-World Image-to-Video Generation with Interactive Complex Camera Control
Authors: Teng Li, Guangcong Zheng, Rui Jiang, Shuigenzhan, Tao Wu, Yehao Lu, Yining Lin, Xi Li
Abstract: Recent advancements in camera-trajectory-guided image-to-video generation offer higher precision and better support for complex camera control compared to text-based approaches. However, they also introduce significant usability challenges, as users often struggle to provide precise camera parameters when working with arbitrary real-world images without knowledge of their depth or scene scale. To address these real-world application issues, we propose RealCam-I2V, a novel diffusion-based video generation framework that integrates monocular metric depth estimation to establish 3D scene reconstruction in a preprocessing step. During training, the reconstructed 3D scene enables scaling camera parameters from relative to absolute values, ensuring compatibility and scale consistency across diverse real-world images. At inference, RealCam-I2V offers an intuitive interface where users can precisely draw camera trajectories by dragging within the 3D scene. To further enhance precise camera control and scene consistency, we propose scene-constrained noise shaping, which shapes high-level noise and also allows the framework to maintain dynamic, coherent video generation in lower noise stages. RealCam-I2V achieves significant improvements in controllability and video quality on RealEstate10K and out-of-domain images. It further enables applications like camera-controlled looping video generation and generative frame interpolation. We will release our absolute-scale annotations, code, and all checkpoints. Please see dynamic results at https://zgctroy.github.io/RealCam-I2V.
Submitted 14 February, 2025; originally announced February 2025.

5. arXiv:2502.09906 [pdf, other]
Categories: cs.CV (Computer Vision and Pattern Recognition)
Title: Insect-Foundation: A Foundation Model and Large Multimodal Dataset for Vision-Language Insect Understanding
Authors: Thanh-Dat Truong, Hoang-Quan Nguyen, Xuan-Bac Nguyen, Ashley Dowling, Xin Li, Khoa Luu
Abstract: Multimodal conversational generative AI has shown impressive capabilities in various vision and language understanding tasks through learning from massive text-image data. However, current conversational models still lack knowledge about visual insects, since they are often trained on the general knowledge of vision-language data. Meanwhile, understanding insects is a fundamental problem in precision agriculture, helping to promote sustainable development in agriculture. Therefore, this paper proposes a novel multimodal conversational model, Insect-LLaVA, to promote visual understanding in insect-domain knowledge. In particular, we first introduce a new large-scale Multimodal Insect Dataset with Visual Insect Instruction Data that enables the capability of learning the multimodal foundation models. Our proposed dataset enables conversational models to comprehend the visual and semantic features of the insects. Second, we propose a new Insect-LLaVA model, a new general Large Language and Vision Assistant for Visual Insect Understanding. Then, to enhance the capability of learning insect features, we develop an Insect Foundation Model by introducing a new micro-feature self-supervised learning with a Patch-wise Relevant Attention mechanism to capture the subtle differences among insect images. We also present a Description Consistency loss to improve micro-feature learning via text descriptions. The experimental results evaluated on our new Visual Insect Question Answering benchmarks illustrate the effective performance of our proposed approach in visual insect understanding, and it achieves state-of-the-art performance on standard benchmarks of insect-related tasks.
Submitted 13 February, 2025; originally announced February 2025.

6. arXiv:2502.09866 [pdf, other]
Categories: cs.HC (Human-Computer Interaction); cs.AI (Artificial Intelligence); cs.CY (Computers and Society); cs.LG (Machine Learning)
Title: How Users Who are Blind or Low Vision Play Mobile Games: Perceptions, Challenges, and Strategies
Authors: Zihe Ran, Xiyu Li, Qing Xiao, Xianzhe Fan, Franklin Mingzhe Li, Yanyun Wang, Zhicong Lu
Abstract: As blind and low-vision (BLV) players engage more deeply with games, accessibility features have become essential. While some research has explored tools and strategies to enhance game accessibility, the specific experiences of these players with mobile games remain underexamined. This study addresses this gap by investigating how BLV users experience mobile games with varying accessibility levels. Through interviews with 32 experienced BLV mobile players, we explore their perceptions, challenges, and strategies for engaging with mobile games. Our findings reveal that BLV players turn to mobile games to alleviate boredom, achieve a sense of accomplishment, and build social connections, but face barriers depending on the game's accessibility level. We also compare mobile games to other forms of gaming, highlighting the relative advantages of mobile games, such as the inherent accessibility of smartphones. This study contributes to understanding BLV mobile gaming experiences and provides insights for enhancing accessible mobile game design.
Submitted 13 February, 2025; originally announced February 2025.
Comments: 18 pages, 3 figures, Accepted by CHI '25

7. arXiv:2502.09620 [pdf, other]
Categories: cs.CV (Computer Vision and Pattern Recognition); cs.AI (Artificial Intelligence); cs.CL (Computation and Language)
Title: Exploring the Potential of Encoder-free Architectures in 3D LMMs
Authors: Yiwen Tang, Zoey Guo, Zhuhao Wang, Ray Zhang, Qizhi Chen, Junli Liu, Delin Qu, Zhigang Wang, Dong Wang, Xuelong Li, Bin Zhao
Abstract: Encoder-free architectures have been preliminarily explored in the 2D visual domain, yet it remains an open question whether they can be effectively applied to 3D understanding scenarios. In this paper, we present the first comprehensive investigation into the potential of encoder-free architectures to overcome the challenges of encoder-based 3D Large Multimodal Models (LMMs). These challenges include the failure to adapt to varying point cloud resolutions and the point features from the encoder not meeting the semantic needs of Large Language Models (LLMs). We identify key aspects for 3D LMMs to remove the encoder and enable the LLM to assume the role of the 3D encoder: 1) We propose the LLM-embedded Semantic Encoding strategy in the pre-training stage, exploring the effects of various point cloud self-supervised losses. We also present the Hybrid Semantic Loss to extract high-level semantics. 2) We introduce the Hierarchical Geometry Aggregation strategy in the instruction tuning stage. This incorporates inductive bias into the LLM early layers to focus on the local details of the point clouds. In the end, we present the first encoder-free 3D LMM, ENEL. Our 7B model rivals the current state-of-the-art model, ShapeLLM-13B, achieving 55.0%, 50.92%, and 42.7% on the classification, captioning, and VQA tasks, respectively. Our results demonstrate that the encoder-free architecture is highly promising for replacing encoder-based architectures in the field of 3D understanding. The code is released at https://github.com/Ivan-Tang-3D/ENEL
Submitted 13 February, 2025; originally announced February 2025.
Comments: The code is released at https://github.com/Ivan-Tang-3D/ENEL

8. arXiv:2502.09497 [pdf, ps, other]
Categories: cs.CL (Computation and Language); cs.AI (Artificial Intelligence)
Title: Improve LLM-based Automatic Essay Scoring with Linguistic Features
Authors: Zhaoyi Joey Hou, Alejandro Ciuba, Xiang Lorraine Li
Abstract: Automatic Essay Scoring (AES) assigns scores to student essays, reducing the grading workload for instructors. Developing a scoring system capable of handling essays across diverse prompts is challenging due to the flexibility and diverse nature of the writing task. Existing methods typically fall into two categories: supervised feature-based approaches and large language model (LLM)-based methods. Supervised feature-based approaches often achieve higher performance but require resource-intensive training. In contrast, LLM-based methods are computationally efficient during inference but tend to suffer from lower performance. This paper combines these approaches by incorporating linguistic features into LLM-based scoring. Experimental results show that this hybrid method outperforms baseline models for both in-domain and out-of-domain writing prompts.
Submitted 13 February, 2025; originally announced February 2025.
Comments: To be published in the workshop Innovation and Responsibility in AI-Supported Education (iRaise) at the 2025 Conference on Artificial Intelligence (AAAI)
9. arXiv:2502.09473 [pdf, other]
Categories: cs.LG (Machine Learning); eess.SP (Signal Processing)
Title: Learning to Predict Global Atrial Fibrillation Dynamics from Sparse Measurements
Authors: Alexander Jenkins, Andrea Cini, Joseph Barker, Alexander Sharp, Arunashis Sau, Varun Valentine, Srushti Valasang, Xinyang Li, Tom Wong, Timothy Betts, Danilo Mandic, Cesare Alippi, Fu Siong Ng
Abstract: Catheter ablation of Atrial Fibrillation (AF) consists of a one-size-fits-all treatment with limited success in persistent AF. This may be due to our inability to map the dynamics of AF with the limited resolution and coverage provided by sequential contact mapping catheters, preventing effective patient phenotyping for personalised, targeted ablation. Here we introduce FibMap, a graph recurrent neural network model that reconstructs global AF dynamics from sparse measurements. Trained and validated on 51 non-contact whole-atria recordings, FibMap reconstructs whole-atria dynamics from 10% surface coverage, achieving a 210% lower mean absolute error and an order of magnitude higher performance in tracking phase singularities compared to baseline methods. The clinical utility of FibMap is demonstrated on real-world contact mapping recordings, achieving reconstruction fidelity comparable to non-contact mapping. FibMap's state-spaces and patient-specific parameters offer insights for electrophenotyping AF. Integrating FibMap into clinical practice could enable personalised AF care and improve outcomes.
Submitted 14 February, 2025 (v2); v1 submitted 13 February, 2025; originally announced February 2025.
Comments: Under review
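A graph recurrent network that reconstructs a full field from roughly 10% node coverage, as described above, can be caricatured as a recurrent cell that mixes each node's hidden state with its graph neighbors' while injecting measurements only at observed nodes. The cell below is schematic (random weights, no training) and only illustrates that sparse-input/graph-mixing structure; FibMap's actual cell is not described in the abstract.

```python
import numpy as np

rng = np.random.default_rng(3)

def graph_recurrent_step(h, x_obs, obs_mask, A_norm, Wh, Wx):
    """One schematic step of a graph recurrent cell: each node aggregates its
    neighbors' hidden states (A_norm @ h) and, where available, the sparse
    measurement x_obs; unobserved nodes receive only propagated information."""
    msg = A_norm @ h                                # neighborhood aggregation
    inp = np.where(obs_mask[:, None], x_obs, 0.0)   # zeros at unobserved nodes
    return np.tanh(msg @ Wh + inp @ Wx)

n_nodes, d = 200, 16                                # toy atrial mesh
A = (rng.random((n_nodes, n_nodes)) < 0.05).astype(float)
A = np.maximum(A, A.T)                              # symmetric adjacency
A_norm = A / np.maximum(A.sum(1, keepdims=True), 1)
obs_mask = rng.random(n_nodes) < 0.10               # ~10% surface coverage
Wh = rng.normal(size=(d, d)) * 0.1
Wx = rng.normal(size=(1, d))

h = np.zeros((n_nodes, d))
for _ in range(5):                                  # unroll over time
    x_obs = rng.normal(size=(n_nodes, 1))           # toy measurements
    h = graph_recurrent_step(h, x_obs, obs_mask, A_norm, Wh, Wx)
print(h.shape)  # a hidden state for every node, read out to the full atria
```

After unrolling, a readout over the hidden states yields a prediction at every node, including the ~90% of the surface that was never measured.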
10. arXiv:2502.09375 [pdf, other]
Categories: cs.IR (Information Retrieval)
Title: FARM: Frequency-Aware Model for Cross-Domain Live-Streaming Recommendation
Authors: Xiaodong Li, Ruochen Yang, Shuang Wen, Shen Wang, Yueyang Liu, Guoquan Wang, Weisong Hu, Qiang Luo, Jiawei Sheng, Tingwen Liu, Jiangxia Cao, Shuang Yang, Zhaojie Liu
Abstract: Live-streaming services have attracted widespread popularity due to their real-time interactivity and entertainment value. Users can engage with live-streaming authors by participating in live chats, posting likes, or sending virtual gifts to convey their preferences and support. However, live-streaming services face a serious data-sparsity problem, which can be attributed to two points: (1) users' valuable behaviors, e.g., likes, comments, and gifts, are usually sparse and easily overlooked by the model, making it difficult to describe users' personalized preferences; (2) the main exposure content on our platform is short-video, which is exposed 9 times more than live-streaming, leaving live-streaming content unable to fully model user preference. To this end, we propose a Frequency-Aware Model for Cross-Domain Live-Streaming Recommendation, termed FARM. Specifically, we first present an intra-domain frequency-aware module that enables our model to perceive users' sparse yet valuable behaviors, i.e., high-frequency information, supported by the Discrete Fourier Transform (DFT). To transfer user preference across the short-video and live-streaming domains, we propose a novel align-before-fuse strategy consisting of two parts: a cross-domain preference align module that aligns user preference in both domains with contrastive learning, and a cross-domain preference fuse module that further fuses user preference in both domains using a series of tailor-designed attention mechanisms. Extensive offline experiments and online A/B testing on Kuaishou live-streaming services demonstrate the effectiveness and superiority of FARM. FARM has been deployed in online live-streaming services and currently serves hundreds of millions of users on Kuaishou.
Submitted 13 February, 2025; originally announced February 2025.
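The intra-domain frequency-aware module above is said to surface sparse, high-frequency behaviors via the DFT. A minimal version of that idea transforms a behavior-embedding sequence to the frequency domain, zeroes the low-frequency bins, and transforms back; how FARM actually weights frequency bins is not stated, so the hard cutoff here is an assumption.

```python
import numpy as np

def high_frequency_component(seq, keep_ratio=0.3):
    """Isolate the high-frequency part of a behavior sequence with the DFT:
    transform along time, zero out the lowest-frequency bins, invert.

    seq: (timesteps, dim) real-valued embedding sequence.
    keep_ratio: fraction of the highest-frequency bins to retain (assumed).
    """
    spec = np.fft.rfft(seq, axis=0)               # (freq_bins, dim) spectrum
    cutoff = int(len(spec) * (1 - keep_ratio))    # bins below cutoff = low freq
    spec[:cutoff] = 0                             # keep only the high band
    return np.fft.irfft(spec, n=len(seq), axis=0)

rng = np.random.default_rng(4)
behaviors = rng.normal(size=(64, 8))              # 64 interactions, dim-8 each
print(high_frequency_component(behaviors).shape)  # (64, 8)
```

Intuitively, frequent background behaviors (e.g., passive watches) dominate the low-frequency bins, while rare spikes such as gifts and comments survive in the high band, which is why emphasizing it can keep sparse signals from being averaged away.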
For instance, using the VGG16 model on the CIFAR-10 dataset, we achieve a parameter reduction of 85%, a 61% decrease in FLOPs, and an accuracy of 94.10% (0.14% higher than the original model); using the ResNet architecture on ImageNet, we reduce parameters by 55% while maintaining an accuracy of 76.12% (a drop of only 0.03%). In summary, we successfully reduce model size and computational resource usage while maintaining accuracy. Our code is at https://anonymous.4open.science/r/IJCAI-8104. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">11 pages, 2 figures</span> </p>
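<p>The graph- and tree-structured variants above are the paper's contribution; as a minimal sketch of the structured-sparsity idea they build on, assuming each output filter of a convolution forms one group and using a plain group-norm ranking (our simplification), filter pruning looks like this:</p> <pre><code class="language-python">
# Sketch (our illustration, not sGLP-IB/sTLP-IB): group-lasso-style
# structured pruning that removes whole conv filters with the smallest
# L2 norms. Layer shape and prune ratio are arbitrary.
import numpy as np

def prune_filters(conv_weight: np.ndarray, prune_ratio: float = 0.5) -> np.ndarray:
    """conv_weight: (out_channels, in_channels, kH, kW); returns kept filters."""
    out_channels = conv_weight.shape[0]
    group_norms = np.sqrt((conv_weight ** 2).reshape(out_channels, -1).sum(axis=1))
    n_keep = max(1, int(round(out_channels * (1.0 - prune_ratio))))
    keep = np.argsort(group_norms)[-n_keep:]          # strongest groups survive
    return conv_weight[np.sort(keep)]

w = np.random.randn(64, 32, 3, 3)
print(prune_filters(w, prune_ratio=0.75).shape)       # (16, 32, 3, 3)
</code></pre>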
</li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.09104">arXiv:2502.09104</a> <span> [<a href="https://arxiv.org/pdf/2502.09104">pdf</a>, <a href="https://arxiv.org/ps/2502.09104">ps</a>, <a href="https://arxiv.org/format/2502.09104">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> One-shot Federated Learning Methods: A Practical Guide </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+X">Xiang Liu</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+Z">Zhenheng Tang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+X">Xia Li</a>, <a href="/search/cs?searchtype=author&query=Song%2C+Y">Yijun Song</a>, <a href="/search/cs?searchtype=author&query=Ji%2C+S">Sijie Ji</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Z">Zemin Liu</a>, <a href="/search/cs?searchtype=author&query=Han%2C+B">Bo Han</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+L">Linshan Jiang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+J">Jialin Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2502.09104v1-abstract-full"> One-shot Federated Learning (OFL) is a distributed machine learning paradigm that constrains client-server communication to a single round, addressing the privacy and communication-overhead issues associated with multiple rounds of data exchange in traditional Federated Learning (FL). OFL shows practical potential for integration with future approaches that require collaborative training of models, such as large language models (LLMs). However, current OFL methods face two major challenges, data heterogeneity and model heterogeneity, which result in subpar performance compared to conventional FL methods. Worse still, despite numerous studies addressing these limitations, a comprehensive summary is still lacking. To address these gaps, this paper presents a systematic analysis of the challenges faced by OFL and thoroughly reviews the current methods. We also offer an innovative categorization method and analyze the trade-offs of various techniques. Additionally, we discuss the most promising future directions and the technologies that should be integrated into the OFL field. This work aims to provide guidance and insights for future research. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">10 pages, 1 figure</span> </p>
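<p>For readers new to the paradigm this guide surveys, the baseline OFL loop can be sketched in a few lines: each client trains locally once, and the server aggregates in a single communication round. The FedAvg-style weighted average below is our choice of baseline, not a method from the paper:</p> <pre><code class="language-python">
# Sketch of one-shot federated learning's single communication round.
# Local least-squares stands in for arbitrary local training.
import numpy as np

def local_train(X, y):
    return np.linalg.lstsq(X, y, rcond=None)[0]       # one client's local fit

def one_shot_aggregate(client_weights, client_sizes):
    sizes = np.asarray(client_sizes, dtype=float)
    stacked = np.stack(client_weights)
    return (stacked * (sizes / sizes.sum())[:, None]).sum(axis=0)

rng = np.random.default_rng(0)
true_w = np.array([2.0, -1.0])
data = []
for n in (50, 80, 30):                                # three clients, one round
    X = rng.normal(size=(n, 2))
    data.append((X, X @ true_w + rng.normal(scale=0.1, size=n)))
ws = [local_train(X, y) for X, y in data]
print(one_shot_aggregate(ws, [len(X) for X, _ in data]))   # close to [2, -1]
</code></pre>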
</li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.09093">arXiv:2502.09093</a> <span> [<a href="https://arxiv.org/pdf/2502.09093">pdf</a>, <a href="https://arxiv.org/format/2502.09093">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> From Visuals to Vocabulary: Establishing Equivalence Between Image and Text Token Through Autoregressive Pre-training in MLLMs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+M">Mingxiao Li</a>, <a href="/search/cs?searchtype=author&query=Qu%2C+F">Fang Qu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Z">Zhanpeng Chen</a>, <a href="/search/cs?searchtype=author&query=Su%2C+N">Na Su</a>, <a href="/search/cs?searchtype=author&query=Zhong%2C+Z">Zhizhou Zhong</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Z">Ziyang Chen</a>, <a href="/search/cs?searchtype=author&query=Du%2C+N">Nan Du</a>, <a href="/search/cs?searchtype=author&query=Li%2C+X">Xiaolong Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2502.09093v1-abstract-full"> While MLLMs perform well on perceptual tasks, they lack precise multimodal alignment, which limits their performance. To address this challenge, we propose Vision Dynamic Embedding-Guided Pretraining (VDEP), a hybrid autoregressive training paradigm for MLLMs. Utilizing dynamic embeddings from the MLP following the visual encoder, this approach supervises image hidden states and integrates image tokens into autoregressive training. Existing MLLMs have primarily focused on recovering information from textual inputs, often neglecting the effective processing of image data. In contrast, the key improvement of this work is the reinterpretation of multimodal alignment as a process of recovering information from input data, with particular emphasis on reconstructing detailed visual features. The proposed method integrates seamlessly into standard models without architectural changes. Experiments on 13 benchmarks show that VDEP outperforms existing baselines. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p>
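<p>The hybrid objective can be pictured as next-token cross-entropy on text positions plus a reconstruction term on image positions. In the sketch below the shapes, the MSE choice, and the weight <code>lambda_img</code> are all our assumptions; the paper's actual supervision uses dynamic embeddings from the post-encoder MLP:</p> <pre><code class="language-python">
# Sketch of a VDEP-style hybrid autoregressive loss (illustrative only).
import numpy as np

def hybrid_loss(logits, text_targets, img_hidden, img_targets, lambda_img=1.0):
    """logits: (T_text, V); img_hidden, img_targets: (T_img, D)."""
    shifted = logits - logits.max(axis=1, keepdims=True)          # stable softmax
    log_probs = shifted - np.log(np.exp(shifted).sum(axis=1, keepdims=True))
    ce = -log_probs[np.arange(len(text_targets)), text_targets].mean()
    mse = ((img_hidden - img_targets) ** 2).mean()                # image-state supervision
    return ce + lambda_img * mse

rng = np.random.default_rng(0)
print(hybrid_loss(rng.normal(size=(5, 100)), rng.integers(0, 100, size=5),
                  rng.normal(size=(7, 16)), rng.normal(size=(7, 16))))
</code></pre>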
</li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.08524">arXiv:2502.08524</a> <span> [<a href="https://arxiv.org/pdf/2502.08524">pdf</a>, <a href="https://arxiv.org/format/2502.08524">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> LLM Pretraining with Continuous Concepts </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Tack%2C+J">Jihoon Tack</a>, <a href="/search/cs?searchtype=author&query=Lanchantin%2C+J">Jack Lanchantin</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+J">Jane Yu</a>, <a href="/search/cs?searchtype=author&query=Cohen%2C+A">Andrew Cohen</a>, <a href="/search/cs?searchtype=author&query=Kulikov%2C+I">Ilia Kulikov</a>, <a href="/search/cs?searchtype=author&query=Lan%2C+J">Janice Lan</a>, <a href="/search/cs?searchtype=author&query=Hao%2C+S">Shibo Hao</a>, <a href="/search/cs?searchtype=author&query=Tian%2C+Y">Yuandong Tian</a>, <a href="/search/cs?searchtype=author&query=Weston%2C+J">Jason Weston</a>, <a href="/search/cs?searchtype=author&query=Li%2C+X">Xian Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2502.08524v1-abstract-full"> Next token prediction has been the standard training objective used in large language model pretraining. Representations are learned as a result of optimizing for token-level perplexity. We propose Continuous Concept Mixing (CoCoMix), a novel pretraining framework that combines discrete next token prediction with continuous concepts. Specifically, CoCoMix predicts continuous concepts learned from a pretrained sparse autoencoder and mixes them into the model's hidden state by interleaving with token hidden representations.
Through experiments on multiple benchmarks, including language modeling and downstream reasoning tasks, we show that CoCoMix is more sample efficient and consistently outperforms standard next token prediction, knowledge distillation, and pause-token insertion. We find that combining both concept learning and interleaving in an end-to-end framework is critical to the performance gains. Furthermore, CoCoMix enhances interpretability and steerability by allowing direct inspection and modification of the predicted concepts, offering a transparent way to guide the model's internal reasoning process. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p>
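<p>Mechanically, the "mix by interleaving" step doubles the hidden sequence with predicted concept vectors. The sketch below uses random linear maps as stand-ins; in the paper the concepts come from a pretrained sparse autoencoder, and the interleaving granularity may differ:</p> <pre><code class="language-python">
# Sketch of CoCoMix-style concept interleaving (illustrative shapes).
import numpy as np

rng = np.random.default_rng(0)
T, D, C = 6, 16, 8                        # tokens, hidden dim, concept dim

token_hidden = rng.normal(size=(T, D))    # per-token hidden states
W_concept = rng.normal(size=(D, C))       # concept predictor (stand-in)
W_project = rng.normal(size=(C, D))       # maps concepts back to hidden size

concepts = token_hidden @ W_concept       # (T, C) predicted continuous concepts
concept_hidden = concepts @ W_project     # (T, D) "concept tokens"

mixed = np.empty((2 * T, D))              # token, concept, token, concept, ...
mixed[0::2] = token_hidden
mixed[1::2] = concept_hidden
print(mixed.shape)                        # (12, 16)
</code></pre>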
</li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.08172">arXiv:2502.08172</a> <span> [<a href="https://arxiv.org/pdf/2502.08172">pdf</a>, <a href="https://arxiv.org/format/2502.08172">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Software Engineering">cs.SE</span> </div> </div> <p class="title is-5 mathjax"> Intention is All You Need: Refining Your Code from Your Intention </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Guo%2C+Q">Qi Guo</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+X">Xiaofei Xie</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+S">Shangqing Liu</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+M">Ming Hu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+X">Xiaohong Li</a>, <a href="/search/cs?searchtype=author&query=Bu%2C+L">Lei Bu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2502.08172v1-abstract-full"> Code refinement aims to enhance existing code by addressing issues, refactoring, and optimizing to improve quality and meet specific requirements. As software projects scale in size and complexity, the traditional iterative exchange between reviewers and developers becomes increasingly burdensome. While recent deep learning techniques have been explored to accelerate this process, their performance remains limited, primarily due to challenges in accurately understanding reviewers' intents. This paper proposes an intention-based code refinement technique that enhances the conventional comment-to-code process by explicitly extracting reviewer intentions from the comments. Our approach consists of two key phases: Intention Extraction and Intention-Guided Revision Generation. Intention Extraction categorizes comments using predefined templates, while Intention-Guided Revision Generation employs large language models (LLMs) to generate revised code based on these defined intentions. Three categories with eight subcategories are designed for comment transformation, followed by a hybrid approach that combines rule-based and LLM-based classifiers for accurate classification. Extensive experiments with five LLMs (GPT4o, GPT3.5, DeepSeekV2, DeepSeek7B, CodeQwen7B) under different prompting settings demonstrate that our approach achieves 79% accuracy in intention extraction and up to 66% in code refinement generation. Our results highlight the potential of our approach to enhance data quality and improve the efficiency of code refinement. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p>
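<p>A rule-first, LLM-fallback classifier of the kind the hybrid phase describes can be sketched as follows. The category names, keyword rules, and the <code>llm_classify</code> stub are our assumptions; the paper defines its own three categories and eight subcategories:</p> <pre><code class="language-python">
# Sketch of hybrid (rules + LLM) intention extraction from review comments.
import re

RULES = {
    "fix_defect": re.compile(r"\b(bug|crash|overflow|leak|fix)\b", re.I),
    "refactor":   re.compile(r"\b(rename|extract|simplify|duplicate|refactor)\b", re.I),
    "style":      re.compile(r"\b(format|indent|naming|typo|style)\b", re.I),
}

def llm_classify(comment: str) -> str:
    """Stub: a real system would prompt an LLM for comments no rule matches."""
    return "other"

def extract_intention(comment: str) -> str:
    for label, pattern in RULES.items():
        if pattern.search(comment):
            return label                  # cheap, deterministic rule hit
    return llm_classify(comment)          # LLM fallback

print(extract_intention("Please fix the overflow in parse_len"))   # fix_defect
print(extract_intention("Could we restructure this loop?"))        # other
</code></pre>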
</li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.08150">arXiv:2502.08150</a> <span> [<a href="https://arxiv.org/pdf/2502.08150">pdf</a>, <a href="https://arxiv.org/format/2502.08150">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Force Matching with Relativistic Constraints: A Physics-Inspired Approach to Stable and Efficient Generative Modeling </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Cao%2C+Y">Yang Cao</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+B">Bo Chen</a>, <a href="/search/cs?searchtype=author&query=Li%2C+X">Xiaoyu Li</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+Y">Yingyu Liang</a>, <a href="/search/cs?searchtype=author&query=Sha%2C+Z">Zhizhou Sha</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+Z">Zhenmei Shi</a>, <a href="/search/cs?searchtype=author&query=Song%2C+Z">Zhao Song</a>, <a href="/search/cs?searchtype=author&query=Wan%2C+M">Mingda Wan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2502.08150v1-abstract-full"> This paper introduces Force Matching (ForM), a novel framework for generative modeling that represents an initial exploration into leveraging special relativistic mechanics to enhance the stability of the sampling process. By incorporating the Lorentz factor, ForM imposes a velocity constraint, ensuring that sample velocities remain bounded within a constant limit. This constraint serves as a fundamental mechanism for stabilizing the generative dynamics, leading to a more robust and controlled sampling process. We provide a rigorous theoretical analysis demonstrating that the velocity constraint is preserved throughout the sampling procedure within the ForM framework. To validate the effectiveness of our approach, we conduct extensive empirical evaluations. On the \textit{half-moons} dataset, ForM significantly outperforms baseline methods, achieving the lowest Euclidean distance loss of \textbf{0.714}, in contrast to vanilla first-order flow matching (5.853) and first- and second-order flow matching (5.793). Additionally, we perform an ablation study to further investigate the impact of our velocity constraint, reaffirming the superiority of ForM in stabilizing the generative process. The theoretical guarantees and empirical results underscore the potential of integrating special relativity principles into generative modeling. Our findings suggest that ForM provides a promising pathway toward achieving stable, efficient, and flexible generative processes. This work lays the foundation for future advancements in high-dimensional generative modeling, opening new avenues for the application of physical principles in machine learning. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p>
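<p>The core mechanism is easy to illustrate: a Lorentz-style rescaling keeps any raw velocity below a fixed "speed limit" c. The particular mapping below, v = u / sqrt(1 + |u|²/c²), is our illustrative choice of such a constraint, not necessarily the paper's exact formulation:</p> <pre><code class="language-python">
# Sketch of a relativistic velocity constraint in the spirit of ForM.
import numpy as np

def constrain_velocity(u: np.ndarray, c: float = 1.0) -> np.ndarray:
    """Map an unbounded velocity u to one whose norm is strictly below c."""
    speed_sq = np.sum(u ** 2)
    return u / np.sqrt(1.0 + speed_sq / c ** 2)

for scale in (0.1, 1.0, 100.0):
    v = constrain_velocity(scale * np.ones(2), c=1.0)
    print(scale, np.linalg.norm(v))   # norms approach but never reach c = 1
</code></pre>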
</li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.07776">arXiv:2502.07776</a> <span> [<a href="https://arxiv.org/pdf/2502.07776">pdf</a>, <a href="https://arxiv.org/format/2502.07776">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Auditing Prompt Caching in Language Model APIs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Gu%2C+C">Chenchen Gu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+X+L">Xiang Lisa Li</a>, <a href="/search/cs?searchtype=author&query=Kuditipudi%2C+R">Rohith Kuditipudi</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+P">Percy Liang</a>, <a href="/search/cs?searchtype=author&query=Hashimoto%2C+T">Tatsunori Hashimoto</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2502.07776v1-abstract-full"> Prompt caching in large language models (LLMs) results in data-dependent timing variations: cached prompts are processed faster than non-cached prompts. These timing differences introduce the risk of side-channel timing attacks. For example, if the cache is shared across users, an attacker could identify cached prompts from fast API response times to learn information about other users' prompts. Because prompt caching may cause privacy leakage, transparency around the caching policies of API providers is important. To this end, we develop and conduct statistical audits to detect prompt caching in real-world LLM API providers. We detect global cache sharing across users in seven API providers, including OpenAI, resulting in potential privacy leakage about users' prompts. Timing variations due to prompt caching can also result in leakage of information about model architecture. Namely, we find evidence that OpenAI's embedding model is a decoder-only Transformer, which was previously not publicly known. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">20 pages, 7 figures</span> </p>
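<p>The shape of such a statistical audit is simple to sketch: time repeated requests for one prompt against fresh prompts and test whether the repeats are reliably faster. Everything below is simulated; <code>send_prompt</code> is a hypothetical stand-in for a real API call, and the Mann-Whitney U test is our choice of statistic, not necessarily the paper's:</p> <pre><code class="language-python">
# Sketch of a timing-based prompt-cache audit (fully simulated).
import random
from scipy.stats import mannwhitneyu

def send_prompt(prompt: str) -> float:
    """Hypothetical API call returning latency in seconds (simulated cache)."""
    cached = prompt in send_prompt.cache
    send_prompt.cache.add(prompt)
    base = 0.08 if cached else 0.20           # cache hits return faster
    return base + random.uniform(0.0, 0.02)
send_prompt.cache = set()

target = "the victim prompt " + "x" * 100
send_prompt(target)                            # prime the (simulated) cache

hit_times = [send_prompt(target) for _ in range(25)]
miss_times = [send_prompt(f"fresh prompt {i}") for i in range(25)]

stat, p = mannwhitneyu(hit_times, miss_times, alternative="less")
print(f"p-value that repeated prompts are faster: {p:.2e}")
</code></pre>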
</li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.07658">arXiv:2502.07658</a> <span> [<a href="https://arxiv.org/pdf/2502.07658">pdf</a>, <a href="https://arxiv.org/format/2502.07658">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> </div> </div> <p class="title is-5 mathjax"> IU4Rec: Interest Unit-Based Product Organization and Recommendation for E-Commerce Platform </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wu%2C+W">Wenhao Wu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+X">Xiaojie Li</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+L">Lin Wang</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+J">Jialiang Zhou</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+D">Di Wu</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+Q">Qinye Xie</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Q">Qingheng Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yin Zhang</a>, <a href="/search/cs?searchtype=author&query=Han%2C+S">Shuguang Han</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+F">Fei Huang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+J">Junfeng Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2502.07658v1-abstract-full"> Most recommendation systems follow a product-based paradigm, utilizing user-product interactions to identify the most engaging items for users. However, this product-based paradigm has notable drawbacks for Xianyu~\footnote{Xianyu is China's largest online C2C e-commerce platform, where a large portion of the products are posted by individual sellers}.
Most products on Xianyu are posted by individual sellers and have limited stock available for distribution; once a product is sold, it is no longer available. As a result, most products on Xianyu accumulate relatively few interactions, which limits the effectiveness of traditional recommendation methods that depend on accumulated user-item interactions. To address these issues, we introduce \textbf{IU4Rec}, an \textbf{I}nterest \textbf{U}nit-based two-stage \textbf{Rec}ommendation framework. IU4Rec first groups products into clusters based on attributes such as category, image, and semantics, forming Interest Units (IUs); these IUs are then integrated into the recommendation system, delivering both product and technological innovations. The recommendation process is redesigned into two stages: the first stage recommends Interest Units, capturing broad-level interests, while the second stage guides users to the best option among similar products within the selected IU. User-IU interactions are incorporated into our ranking models, with the advantage that IU behaviors are more persistent than item-specific interactions. Experimental results on the production dataset and online A/B testing demonstrate the effectiveness and superiority of the proposed IU-centric recommendation approach. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Under review at KDD25 ADS. This work has already been deployed on the Xianyu platform in Alibaba. arXiv admin note: substantial text overlap with arXiv:2403.06747</span> </p>
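<p>The first step, grouping products into Interest Units, amounts to clustering product representations. The sketch below uses k-means over random embedding vectors as a stand-in; the paper groups by category, image, and semantic attributes rather than a single generic embedding:</p> <pre><code class="language-python">
# Sketch of forming Interest Units by clustering product embeddings.
import numpy as np
from sklearn.cluster import KMeans

rng = np.random.default_rng(0)
product_embeddings = rng.normal(size=(200, 32))    # one row per listed product

kmeans = KMeans(n_clusters=12, n_init=10, random_state=0)
iu_ids = kmeans.fit_predict(product_embeddings)    # Interest Unit id per product

# Stage 1 would recommend IUs; stage 2 ranks products inside the chosen IU.
members = np.where(iu_ids == 3)[0]
print(f"IU 3 contains {len(members)} products, e.g. {members[:5]}")
</code></pre>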
</li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.07599">arXiv:2502.07599</a> <span> [<a href="https://arxiv.org/pdf/2502.07599">pdf</a>, <a href="https://arxiv.org/format/2502.07599">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> DPO-Shift: Shifting the Distribution of Direct Preference Optimization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yang%2C+X">Xiliang Yang</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+F">Feng Jiang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Q">Qianen Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+L">Lei Zhao</a>, <a href="/search/cs?searchtype=author&query=Li%2C+X">Xiao Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2502.07599v1-abstract-full"> Direct Preference Optimization (DPO) and its variants have become increasingly popular for aligning language models with human preferences. These methods aim to teach models to better distinguish between chosen (or preferred) and rejected (or dispreferred) responses. However, prior research has identified that the probability of chosen responses often decreases during training, a phenomenon known as likelihood displacement. To tackle this challenge, in this work we introduce DPO-Shift to controllably shift the distribution of the chosen probability. We then show that DPO-Shift exhibits a fundamental trade-off between improving the chosen probability and sacrificing the reward margin, as supported by both theoretical analysis and experimental validation. Furthermore, we demonstrate the superiority of DPO-Shift over DPO on downstream tasks such as MT-Bench and a designed win rate experiment. We believe this study shows that the likelihood displacement issue of DPO can be effectively mitigated with a simple, theoretically grounded solution. Our code is available at https://github.com/Meaquadddd/DPO-Shift. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p>
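<p>For orientation, vanilla DPO minimizes -log σ(β(r_chosen - r_rejected)), where the r terms are policy/reference log-ratios. One plausible way to "shift the distribution of the chosen probability" is to scale one term, as sketched below; the placement and form of the shift factor here are our guesses for illustration, so consult the paper and repository above for the actual parameterization:</p> <pre><code class="language-python">
# Sketch of a DPO-style loss with a shift parameter (illustrative).
import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def dpo_shift_loss(logp_w, logp_l, ref_logp_w, ref_logp_l,
                   beta=0.1, f_lambda=0.95):
    r_chosen = logp_w - ref_logp_w           # chosen log-ratio
    r_rejected = logp_l - ref_logp_l         # rejected log-ratio
    margin = beta * (r_chosen - f_lambda * r_rejected)
    return float(-np.log(sigmoid(margin)))

# f_lambda = 1.0 recovers vanilla DPO; smaller values relax the rejected term.
print(dpo_shift_loss(-10.0, -12.0, -11.0, -11.5, f_lambda=1.0))
print(dpo_shift_loss(-10.0, -12.0, -11.0, -11.5, f_lambda=0.9))
</code></pre>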
</li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.07472">arXiv:2502.07472</a> <span> [<a href="https://arxiv.org/pdf/2502.07472">pdf</a>, <a href="https://arxiv.org/format/2502.07472">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> Robotic In-Hand Manipulation for Large-Range Precise Object Movement: The RGMC Champion Solution </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yu%2C+M">Mingrui Yu</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+Y">Yongpeng Jiang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+C">Chen Chen</a>, <a href="/search/cs?searchtype=author&query=Jia%2C+Y">Yongyi Jia</a>, <a href="/search/cs?searchtype=author&query=Li%2C+X">Xiang Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2502.07472v1-abstract-full"> In-hand manipulation using multiple dexterous fingers is a critical robotic skill that can reduce the reliance on large arm motions, thereby saving space and energy. This letter focuses on in-grasp object movement, which refers to manipulating an object to a desired pose through only finger motions within a stable grasp. The key challenge lies in simultaneously achieving high precision and large-range movements while maintaining a constant stable grasp. To address this problem, we propose a simple and practical approach based on kinematic trajectory optimization with no need for pretraining or object geometries, which can be easily applied to novel objects in real-world scenarios. Adopting this approach, we won the championship for the in-hand manipulation track at the 9th Robotic Grasping and Manipulation Competition (RGMC) held at ICRA 2024. Implementation details, discussion, and further quantitative experimental results are presented in this letter, which aims to comprehensively evaluate our approach and share our key takeaways from the competition.
Supplementary materials including video and code are available at https://rgmc-xl-team.github.io/ingrasp_manipulation . </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Submitted to RA-L. Project website: https://rgmc-xl-team.github.io/ingrasp_manipulation</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.07358">arXiv:2502.07358</a> <span> [<a href="https://arxiv.org/pdf/2502.07358">pdf</a>, <a href="https://arxiv.org/format/2502.07358">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> SymbioSim: Human-in-the-loop Simulation Platform for Bidirectional Continuing Learning in Human-Robot Interaction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+H">Haoran Chen</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+Y">Yiteng Xu</a>, <a href="/search/cs?searchtype=author&query=Ren%2C+Y">Yiming Ren</a>, <a href="/search/cs?searchtype=author&query=Ye%2C+Y">Yaoqin Ye</a>, <a href="/search/cs?searchtype=author&query=Li%2C+X">Xinran Li</a>, <a href="/search/cs?searchtype=author&query=Ding%2C+N">Ning Ding</a>, <a href="/search/cs?searchtype=author&query=Cong%2C+P">Peishan Cong</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Ziyi Wang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+B">Bushi Liu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yuhan Chen</a>, <a href="/search/cs?searchtype=author&query=Dou%2C+Z">Zhiyang Dou</a>, <a href="/search/cs?searchtype=author&query=Leng%2C+X">Xiaokun Leng</a>, <a href="/search/cs?searchtype=author&query=Li%2C+M">Manyi Li</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+Y">Yuexin Ma</a>, <a href="/search/cs?searchtype=author&query=Tu%2C+C">Changhe Tu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2502.07358v1-abstract-full"> The development of intelligent robots seeks to seamlessly integrate them into the human world, providing assistance and companionship in daily life and work, with the ultimate goal of achieving human-robot symbiosis.
To realize this vision, robots must continuously learn and evolve through consistent interaction and collaboration with humans, while humans need to gradually develop an understanding of and trust in robots through shared experiences. However, training and testing algorithms directly on physical robots involves substantial costs and safety risks. Moreover, current robotic simulators fail to support real human participation, limiting their ability to provide authentic interaction experiences and gather valuable human feedback. In this paper, we introduce SymbioSim, a novel human-in-the-loop robotic simulation platform designed to enable the safe and efficient development, evaluation, and optimization of human-robot interactions. By leveraging a carefully designed system architecture and modules, SymbioSim delivers a natural and realistic interaction experience, facilitating bidirectional continuous learning and adaptation for both humans and robots. Extensive experiments and user studies demonstrate the platform's promising performance and highlight its potential to significantly advance research on human-robot symbiosis. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025.
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.07309">arXiv:2502.07309</a> <span> [<a href="https://arxiv.org/pdf/2502.07309">pdf</a>, <a href="https://arxiv.org/format/2502.07309">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Semi-Supervised Vision-Centric 3D Occupancy World Model for Autonomous Driving </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+X">Xiang Li</a>, <a href="/search/cs?searchtype=author&query=Li%2C+P">Pengfei Li</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+Y">Yupeng Zheng</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+W">Wei Sun</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yan Wang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yilun Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2502.07309v1-abstract-full"> Understanding world dynamics is crucial for planning in autonomous driving. Recent methods attempt to achieve this by learning a 3D occupancy world model that forecasts future surrounding scenes based on current observation. However, 3D occupancy labels are still required to produce promising results. Considering the high annotation cost for 3D outdoor scenes, we propose a semi-supervised vision-centric 3D occupancy world model, PreWorld, to leverage the potential of 2D labels through a novel two-stage training paradigm: a self-supervised pre-training stage and a fully-supervised fine-tuning stage. Specifically, during the pre-training stage, we utilize an attribute projection head to generate different attribute fields of a scene (e.g., RGB, density, semantics), thus enabling temporal supervision from 2D labels via volume rendering techniques. Furthermore, we introduce a simple yet effective state-conditioned forecasting module to recursively forecast future occupancy and ego trajectory in a direct manner. Extensive experiments on the nuScenes dataset validate the effectiveness and scalability of our method, and demonstrate that PreWorld achieves competitive performance across 3D occupancy prediction, 4D occupancy forecasting and motion planning tasks. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by ICLR 2025</span> </p>
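<p>The piece that lets 2D labels supervise a 3D field is standard volume rendering: samples along a camera ray are composited with weights w_i = T_i (1 - exp(-σ_i δ_i)), and the rendered pixel is compared against the 2D label. The densities and colors below are random stand-ins for the attribute fields the paper predicts:</p> <pre><code class="language-python">
# Sketch of the volume-rendering step used for 2D supervision.
import numpy as np

def render_ray(sigmas, colors, deltas):
    """Composite per-sample colors into one pixel color along a ray."""
    alphas = 1.0 - np.exp(-sigmas * deltas)                          # per-sample opacity
    trans = np.cumprod(np.concatenate([[1.0], 1.0 - alphas[:-1]]))   # transmittance T_i
    weights = trans * alphas
    return (weights[:, None] * colors).sum(axis=0)                   # expected color

rng = np.random.default_rng(0)
n = 64                                                # samples along one ray
pixel = render_ray(rng.uniform(0, 2, n), rng.uniform(0, 1, (n, 3)), np.full(n, 0.05))
print(pixel)   # compare against the 2D RGB label to form a training loss
</code></pre>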
</li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.07205">arXiv:2502.07205</a> <span> [<a href="https://arxiv.org/pdf/2502.07205">pdf</a>, <a href="https://arxiv.org/format/2502.07205">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> VINP: Variational Bayesian Inference with Neural Speech Prior for Joint ASR-Effective Speech Dereverberation and Blind RIR Identification </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+P">Pengyu Wang</a>, <a href="/search/cs?searchtype=author&query=Fang%2C+Y">Ying Fang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+X">Xiaofei Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2502.07205v1-abstract-full"> Reverberant speech, denoting a speech signal degraded by the process of reverberation, contains crucial knowledge of both the anechoic source speech and the room impulse response (RIR). This work proposes a variational Bayesian inference (VBI) framework with a neural speech prior (VINP) for joint speech dereverberation and blind RIR identification. In VINP, a probabilistic signal model is constructed in the time-frequency (T-F) domain based on the convolution transfer function (CTF) approximation. For the first time, we propose using an arbitrary discriminative dereverberation deep neural network (DNN) to predict the prior distribution of anechoic speech within a probabilistic model.
By integrating both the reverberant speech and the anechoic speech prior, VINP yields maximum a posteriori (MAP) and maximum likelihood (ML) estimates of the anechoic speech spectrum and the CTF filter, respectively. After simple transformations, the waveforms of the anechoic speech and the RIR are estimated. Moreover, VINP is effective for automatic speech recognition (ASR) systems, which sets it apart from most deep learning (DL)-based single-channel dereverberation approaches. Experiments on single-channel speech dereverberation demonstrate that VINP achieves advanced performance on most metrics related to human perception and state-of-the-art (SOTA) performance on ASR-related metrics. For blind RIR identification, experiments indicate that VINP attains SOTA performance in blind estimation of the reverberation time at 60 dB (RT60) and the direct-to-reverberation ratio (DRR). Code and audio samples are available online. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Submitted to IEEE/ACM Trans. on TASLP</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.06911">arXiv:2502.06911</a> <span> [<a href="https://arxiv.org/pdf/2502.06911">pdf</a>, <a href="https://arxiv.org/format/2502.06911">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Foundation Models for Anomaly Detection: Vision and Challenges </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ren%2C+J">Jing Ren</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+T">Tao Tang</a>, <a href="/search/cs?searchtype=author&query=Jia%2C+H">Hong Jia</a>, <a href="/search/cs?searchtype=author&query=Fayek%2C+H">Haytham Fayek</a>, <a href="/search/cs?searchtype=author&query=Li%2C+X">Xiaodong Li</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+S">Suyu Ma</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+X">Xiwei Xu</a>, <a href="/search/cs?searchtype=author&query=Xia%2C+F">Feng Xia</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2502.06911v1-abstract-full"> As data continues to grow in volume and complexity across domains such as finance, manufacturing, and healthcare, effective anomaly detection is essential for identifying irregular patterns that may signal critical issues. Recently, foundation models (FMs) have emerged as a powerful tool for advancing anomaly detection.
They have demonstrated unprecedented capabilities in enhancing anomaly identification, generating detailed data descriptions, and providing visual explanations. This survey presents the first comprehensive review of recent advancements in FM-based anomaly detection. We propose a novel taxonomy that classifies FMs into three categories based on their roles in anomaly detection tasks, i.e., as encoders, detectors, or interpreters. We provide a systematic analysis of state-of-the-art methods and discuss key challenges in leveraging FMs for improved anomaly detection. We also outline future research directions in this rapidly evolving field. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025.
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">9 pages, 4 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.06909">arXiv:2502.06909</a> <span> [<a href="https://arxiv.org/pdf/2502.06909">pdf</a>, <a href="https://arxiv.org/format/2502.06909">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Science and Game Theory">cs.GT</span> </div> </div> <p class="title is-5 mathjax"> Satisfaction-Aware Incentive Scheme for Federated Learning in Industrial Metaverse: DRL-Based Stackelberg Game Approach </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+X">Xiaohuan Li</a>, <a href="/search/cs?searchtype=author&query=Qin%2C+S">Shaowen Qin</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+X">Xin Tang</a>, <a href="/search/cs?searchtype=author&query=Kang%2C+J">Jiawen Kang</a>, <a href="/search/cs?searchtype=author&query=Ye%2C+J">Jin Ye</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+Z">Zhonghua Zhao</a>, <a href="/search/cs?searchtype=author&query=Niyato%2C+D">Dusit Niyato</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2502.06909v1-abstract-full"> Industrial Metaverse leverages the Industrial Internet of Things (IIoT) to integrate data from diverse devices, employing federated learning and meta-computing to train models in a distributed manner while ensuring data privacy. Achieving an immersive experience for the industrial Metaverse necessitates maintaining a balance between model quality and training latency. Consequently, a primary challenge in federated learning tasks is optimizing overall system performance by balancing model quality and training latency. This paper designs a satisfaction function that accounts for data size, Age of Information (AoI), and training latency. Additionally, the satisfaction function is incorporated into the utility functions to incentivize node participation in model training. We model the utility functions of servers and nodes as a two-stage Stackelberg game and employ a deep reinforcement learning approach to learn the Stackelberg equilibrium.
This approach ensures balanced rewards and enhances the applicability of the incentive scheme for industrial Metaverse. Simulation results demonstrate that, under the same budget constraints, the proposed incentive scheme improves at least 23.7% utility compared to existing schemes without compromising model accuracy. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.06909v1-abstract-full').style.display = 'none'; document.getElementById('2502.06909v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.06788">arXiv:2502.06788</a> <span> [<a href="https://arxiv.org/pdf/2502.06788">pdf</a>, <a href="https://arxiv.org/format/2502.06788">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> EVEv2: Improved Baselines for Encoder-Free Vision-Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Diao%2C+H">Haiwen Diao</a>, <a href="/search/cs?searchtype=author&query=Li%2C+X">Xiaotong Li</a>, <a href="/search/cs?searchtype=author&query=Cui%2C+Y">Yufeng Cui</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yueze Wang</a>, <a href="/search/cs?searchtype=author&query=Deng%2C+H">Haoge Deng</a>, <a href="/search/cs?searchtype=author&query=Pan%2C+T">Ting Pan</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+W">Wenxuan Wang</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+H">Huchuan Lu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xinlong Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.06788v1-abstract-short" style="display: inline;"> Existing encoder-free vision-language models (VLMs) are rapidly narrowing the performance gap with their encoder-based counterparts, highlighting the promising potential for unified multimodal systems with structural simplicity and efficient deployment. We systematically clarify the performance gap between VLMs using pre-trained vision encoders, discrete tokenizers, and minimalist visual layers fr… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.06788v1-abstract-full').style.display = 'inline'; document.getElementById('2502.06788v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.06788v1-abstract-full" style="display: none;"> Existing encoder-free vision-language models (VLMs) are rapidly narrowing the performance gap with their encoder-based counterparts, highlighting the promising potential for unified multimodal systems with structural simplicity and efficient deployment. 
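To make the incentive structure concrete, the following minimal sketch folds a satisfaction term built from data size, AoI, and latency into a node utility, with the server acting as Stackelberg leader over the nodes' best responses. This is not the paper's implementation: all functional forms, weights, and the enumeration of reward rates are illustrative assumptions (the paper learns the equilibrium with DRL instead).

```python
import math

def satisfaction(data_size, aoi, latency, a=0.6, b=0.3, c=0.5):
    # Illustrative satisfaction term: grows with contributed data size,
    # decays with Age of Information (AoI) and training latency.
    return a * math.log(1.0 + data_size) - b * aoi - c * latency

def node_utility(rate, size, aoi=1.0, latency=2.0, cost=1e-4):
    # Follower utility: payment + satisfaction - (convex) data cost.
    return rate * size + satisfaction(size, aoi, latency) - cost * size ** 2

def best_response(rate, sizes=range(1, 501)):
    # Follower stage: each node picks the data size maximizing its utility.
    return max(sizes, key=lambda s: node_utility(rate, s))

def leader_choice(budget=50.0, rates=(0.02, 0.05, 0.1, 0.2)):
    # Leader stage: the server picks a reward rate while anticipating the
    # nodes' best responses, subject to a payment budget.
    pairs = [(r, best_response(r)) for r in rates]
    feasible = [(r, s) for r, s in pairs if r * s <= budget]
    return max(feasible, key=lambda rs: rs[1])  # e.g. maximize data collected

print(leader_choice())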
arXiv:2502.06788 [cs.CV, cs.AI] (https://arxiv.org/abs/2502.06788)
Title: EVEv2: Improved Baselines for Encoder-Free Vision-Language Models
Authors: Haiwen Diao, Xiaotong Li, Yufeng Cui, Yueze Wang, Haoge Deng, Ting Pan, Wenxuan Wang, Huchuan Lu, Xinlong Wang
Abstract: Existing encoder-free vision-language models (VLMs) are rapidly narrowing the performance gap with their encoder-based counterparts, highlighting the promising potential for unified multimodal systems with structural simplicity and efficient deployment. We systematically clarify the performance gap between VLMs using pre-trained vision encoders, discrete tokenizers, and minimalist visual layers trained from scratch, deeply excavating the under-examined characteristics of encoder-free VLMs. We develop efficient strategies for encoder-free VLMs that rival mainstream encoder-based ones. After an in-depth investigation, we launch EVEv2.0, a new and improved family of encoder-free VLMs. We show that: (i) properly decomposing and hierarchically associating vision and language within a unified model reduces interference between modalities; and (ii) a well-designed training strategy enables effective optimization for encoder-free VLMs. Through extensive evaluation, EVEv2.0 represents a thorough study of developing a decoder-only architecture across modalities, demonstrating superior data efficiency and strong vision-reasoning capability. Code is publicly available at: https://github.com/baaivision/EVE
Submitted 10 February, 2025; originally announced February 2025.
Comments: 19 pages, 9 figures
class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.06782v2-abstract-short" style="display: inline;"> Recent advancements have established Diffusion Transformers (DiTs) as a dominant framework in generative modeling. Building on this success, Lumina-Next achieves exceptional performance in the generation of photorealistic images with Next-DiT. However, its potential for video generation remains largely untapped, with significant challenges in modeling the spatiotemporal complexity inherent to vide… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.06782v2-abstract-full').style.display = 'inline'; document.getElementById('2502.06782v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.06782v2-abstract-full" style="display: none;"> Recent advancements have established Diffusion Transformers (DiTs) as a dominant framework in generative modeling. Building on this success, Lumina-Next achieves exceptional performance in the generation of photorealistic images with Next-DiT. However, its potential for video generation remains largely untapped, with significant challenges in modeling the spatiotemporal complexity inherent to video data. To address this, we introduce Lumina-Video, a framework that leverages the strengths of Next-DiT while introducing tailored solutions for video synthesis. Lumina-Video incorporates a Multi-scale Next-DiT architecture, which jointly learns multiple patchifications to enhance both efficiency and flexibility. By incorporating the motion score as an explicit condition, Lumina-Video also enables direct control of generated videos' dynamic degree. Combined with a progressive training scheme with increasingly higher resolution and FPS, and a multi-source training scheme with mixed natural and synthetic data, Lumina-Video achieves remarkable aesthetic quality and motion smoothness at high training and inference efficiency. We additionally propose Lumina-V2A, a video-to-audio model based on Next-DiT, to create synchronized sounds for generated videos. Codes are released at https://www.github.com/Alpha-VLLM/Lumina-Video. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.06782v2-abstract-full').style.display = 'none'; document.getElementById('2502.06782v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 10 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.06703">arXiv:2502.06703</a> <span> [<a href="https://arxiv.org/pdf/2502.06703">pdf</a>, <a href="https://arxiv.org/format/2502.06703">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Can 1B LLM Surpass 405B LLM? 
Rethinking Compute-Optimal Test-Time Scaling </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+R">Runze Liu</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+J">Junqi Gao</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+J">Jian Zhao</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+K">Kaiyan Zhang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+X">Xiu Li</a>, <a href="/search/cs?searchtype=author&query=Qi%2C+B">Biqing Qi</a>, <a href="/search/cs?searchtype=author&query=Ouyang%2C+W">Wanli Ouyang</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+B">Bowen Zhou</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.06703v1-abstract-short" style="display: inline;"> Test-Time Scaling (TTS) is an important method for improving the performance of Large Language Models (LLMs) by using additional computation during the inference phase. However, current studies do not systematically analyze how policy models, Process Reward Models (PRMs), and problem difficulty influence TTS. This lack of analysis limits the understanding and practical use of TTS methods. In this… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.06703v1-abstract-full').style.display = 'inline'; document.getElementById('2502.06703v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.06703v1-abstract-full" style="display: none;"> Test-Time Scaling (TTS) is an important method for improving the performance of Large Language Models (LLMs) by using additional computation during the inference phase. However, current studies do not systematically analyze how policy models, Process Reward Models (PRMs), and problem difficulty influence TTS. This lack of analysis limits the understanding and practical use of TTS methods. In this paper, we focus on two core questions: (1) What is the optimal approach to scale test-time computation across different policy models, PRMs, and problem difficulty levels? (2) To what extent can extended computation improve the performance of LLMs on complex tasks, and can smaller language models outperform larger ones through this approach? Through comprehensive experiments on MATH-500 and challenging AIME24 tasks, we have the following observations: (1) The compute-optimal TTS strategy is highly dependent on the choice of policy model, PRM, and problem difficulty. (2) With our compute-optimal TTS strategy, extremely small policy models can outperform larger models. For example, a 1B LLM can exceed a 405B LLM on MATH-500. Moreover, on both MATH-500 and AIME24, a 0.5B LLM outperforms GPT-4o, a 3B LLM surpasses a 405B LLM, and a 7B LLM beats o1 and DeepSeek-R1, while with higher inference efficiency. These findings show the significance of adapting TTS strategies to the specific characteristics of each task and model and indicate that TTS is a promising approach for enhancing the reasoning abilities of LLMs. 
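For readers unfamiliar with TTS, a minimal best-of-N sketch (one standard strategy in this family; `policy_sample` and `prm_score` are stand-ins for real model calls, not the paper's code) looks like this:

```python
import random
random.seed(0)

def policy_sample(problem: str) -> str:
    # Stand-in for sampling one candidate solution from a policy LLM.
    return f"solution-{random.randint(0, 9)} to {problem}"

def prm_score(problem: str, solution: str) -> float:
    # Stand-in for a Process Reward Model scoring a candidate in [0, 1].
    return random.random()

def best_of_n(problem: str, n: int) -> str:
    # Draw N candidates from the policy and keep the PRM's favorite.
    candidates = [policy_sample(problem) for _ in range(n)]
    return max(candidates, key=lambda s: prm_score(problem, s))

print(best_of_n("MATH-500 item #17", n=8))
```

The paper's point is that the compute-optimal variant of such strategies (how many candidates to draw, and how to select among them) shifts with the policy model, the PRM, and problem difficulty.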
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.06703v1-abstract-full').style.display = 'none'; document.getElementById('2502.06703v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.06544">arXiv:2502.06544</a> <span> [<a href="https://arxiv.org/pdf/2502.06544">pdf</a>, <a href="https://arxiv.org/format/2502.06544">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Sequence Transferability and Task Order Selection in Continual Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Nguyen%2C+T">Thinh Nguyen</a>, <a href="/search/cs?searchtype=author&query=Nguyen%2C+C+N">Cuong N. Nguyen</a>, <a href="/search/cs?searchtype=author&query=Pham%2C+Q">Quang Pham</a>, <a href="/search/cs?searchtype=author&query=Nguyen%2C+B+T">Binh T. Nguyen</a>, <a href="/search/cs?searchtype=author&query=Ramasamy%2C+S">Savitha Ramasamy</a>, <a href="/search/cs?searchtype=author&query=Li%2C+X">Xiaoli Li</a>, <a href="/search/cs?searchtype=author&query=Nguyen%2C+C+V">Cuong V. Nguyen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.06544v1-abstract-short" style="display: inline;"> In continual learning, understanding the properties of task sequences and their relationships to model performance is important for developing advanced algorithms with better accuracy. However, efforts in this direction remain underdeveloped despite encouraging progress in methodology development. In this work, we investigate the impacts of sequence transferability on continual learning and propos… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.06544v1-abstract-full').style.display = 'inline'; document.getElementById('2502.06544v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.06544v1-abstract-full" style="display: none;"> In continual learning, understanding the properties of task sequences and their relationships to model performance is important for developing advanced algorithms with better accuracy. However, efforts in this direction remain underdeveloped despite encouraging progress in methodology development. In this work, we investigate the impacts of sequence transferability on continual learning and propose two novel measures that capture the total transferability of a task sequence, either in the forward or backward direction. Based on the empirical properties of these measures, we then develop a new method for the task order selection problem in continual learning. Our method can be shown to offer a better performance than the conventional strategy of random task selection. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.06544v1-abstract-full').style.display = 'none'; document.getElementById('2502.06544v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">10 pages, 5 figures</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">MSC Class:</span> 68T45; 68T01 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.06452">arXiv:2502.06452</a> <span> [<a href="https://arxiv.org/pdf/2502.06452">pdf</a>, <a href="https://arxiv.org/format/2502.06452">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Quantitative Methods">q-bio.QM</span> </div> </div> <p class="title is-5 mathjax"> SparseFocus: Learning-based One-shot Autofocus for Microscopy with Sparse Content </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhai%2C+Y">Yongping Zhai</a>, <a href="/search/cs?searchtype=author&query=Fu%2C+X">Xiaoxi Fu</a>, <a href="/search/cs?searchtype=author&query=Su%2C+Q">Qiang Su</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+J">Jia Hu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yake Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+Y">Yunfeng Zhou</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+C">Chaofan Zhang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+X">Xiao Li</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+W">Wenxin Wang</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+D">Dongdong Wu</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+S">Shen Yan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.06452v1-abstract-short" style="display: inline;"> Autofocus is necessary for high-throughput and real-time scanning in microscopic imaging. Traditional methods rely on complex hardware or iterative hill-climbing algorithms. Recent learning-based approaches have demonstrated remarkable efficacy in a one-shot setting, avoiding hardware modifications or iterative mechanical lens adjustments. However, in this paper, we highlight a significant challen… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.06452v1-abstract-full').style.display = 'inline'; document.getElementById('2502.06452v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.06452v1-abstract-full" style="display: none;"> Autofocus is necessary for high-throughput and real-time scanning in microscopic imaging. Traditional methods rely on complex hardware or iterative hill-climbing algorithms. 
Recent learning-based approaches have demonstrated remarkable efficacy in a one-shot setting, avoiding hardware modifications or iterative mechanical lens adjustments. However, in this paper, we highlight a significant challenge that the richness of image content can significantly affect autofocus performance. When the image content is sparse, previous autofocus methods, whether traditional climbing-hill or learning-based, tend to fail. To tackle this, we propose a content-importance-based solution, named SparseFocus, featuring a novel two-stage pipeline. The first stage measures the importance of regions within the image, while the second stage calculates the defocus distance from selected important regions. To validate our approach and benefit the research community, we collect a large-scale dataset comprising millions of labelled defocused images, encompassing both dense, sparse and extremely sparse scenarios. Experimental results show that SparseFocus surpasses existing methods, effectively handling all levels of content sparsity. Moreover, we integrate SparseFocus into our Whole Slide Imaging (WSI) system that performs well in real-world applications. The code and dataset will be made available upon the publication of this paper. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.06452v1-abstract-full').style.display = 'none'; document.getElementById('2502.06452v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.06387">arXiv:2502.06387</a> <span> [<a href="https://arxiv.org/pdf/2502.06387">pdf</a>, <a href="https://arxiv.org/format/2502.06387">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Science and Game Theory">cs.GT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Theoretical Economics">econ.TH</span> </div> </div> <p class="title is-5 mathjax"> How Humans Help LLMs: Assessing and Incentivizing Human Preference Annotators </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+S">Shang Liu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+H">Hanzhao Wang</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+Z">Zhongyao Ma</a>, <a href="/search/cs?searchtype=author&query=Li%2C+X">Xiaocheng Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.06387v1-abstract-short" style="display: inline;"> Human-annotated preference data play an important role in aligning large language models (LLMs). In this paper, we investigate the questions of assessing the performance of human annotators and incentivizing them to provide high-quality annotations. 
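A toy rendering of the two-stage idea follows. It is a sketch only: the paper uses learned modules for both stages, whereas intensity variance and gradient energy here are crude hand-crafted proxies chosen for illustration.

```python
import numpy as np

def region_importance(patch: np.ndarray) -> float:
    # Stage 1 stand-in: score a patch's content richness by its
    # intensity variance (the paper learns this importance measure).
    return float(patch.var())

def defocus_from_regions(patches) -> float:
    # Stage 2 stand-in: estimate defocus distance from selected patches
    # via a sharpness proxy (gradient energy): blurrier -> larger distance.
    grads = [np.abs(np.diff(p, axis=0)).mean() for p in patches]
    return 1.0 / (np.mean(grads) + 1e-6)

def sparse_focus(image: np.ndarray, patch=64, top_k=4) -> float:
    # Rank patches by importance, then estimate defocus only from the
    # top-k informative regions, so empty background in sparse slides
    # does not dominate the estimate.
    h, w = image.shape
    tiles = [image[i:i + patch, j:j + patch]
             for i in range(0, h - patch + 1, patch)
             for j in range(0, w - patch + 1, patch)]
    tiles.sort(key=region_importance, reverse=True)
    return defocus_from_regions(tiles[:top_k])

img = np.random.rand(256, 256)
print(sparse_focus(img))
```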
arXiv:2502.06387 [cs.LG, cs.GT, econ.TH] (https://arxiv.org/abs/2502.06387)
Title: How Humans Help LLMs: Assessing and Incentivizing Human Preference Annotators
Authors: Shang Liu, Hanzhao Wang, Zhongyao Ma, Xiaocheng Li
Abstract: Human-annotated preference data play an important role in aligning large language models (LLMs). In this paper, we investigate the questions of assessing the performance of human annotators and incentivizing them to provide high-quality annotations. The quality assessment of language/text annotation faces two challenges: (i) the intrinsic heterogeneity among annotators, which rules out the classic methods that assume the existence of an underlying true label; and (ii) the unclear relationship between annotation quality and the performance of downstream tasks, which excludes the possibility of inferring the annotators' behavior from the performance of models trained on the annotated data. We then formulate a principal-agent model to characterize the behaviors of, and the interactions between, the company and the human annotators. The model rationalizes a practical bonus scheme to incentivize annotators that benefits both parties, and it underscores the importance of the joint presence of an assessment system and a proper contract scheme. From a technical perspective, our analysis extends the existing literature on the principal-agent model by considering a continuous action space for the agent. We show that the gap between the first-best and the second-best solutions (under the continuous action space) is $\Theta(1/\sqrt{n \log n})$ for binary contracts and $\Theta(1/n)$ for linear contracts, where $n$ is the number of samples used for performance assessment; this contrasts with the known result of $\exp(-\Theta(n))$ for binary contracts when the action space is discrete. Throughout the paper, we use real preference annotation data to accompany our discussions.
Submitted 10 February, 2025; originally announced February 2025.
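To make the contract setting concrete, here is a minimal simulation of the two contract families analyzed. It rests on simplified assumptions: annotator quality is reduced to a hidden "effort" probability estimated from n spot-checked samples, and all thresholds and rates are illustrative, not the paper's.

```python
import random
random.seed(1)

def assessed_quality(effort: float, n: int) -> float:
    # Noisy empirical estimate of annotation quality from n spot-checked
    # samples: each check succeeds w.p. `effort` (the agent's hidden action).
    return sum(random.random() < effort for _ in range(n)) / n

def binary_contract(effort, n=400, threshold=0.75, bonus=1.0):
    # Pay a fixed bonus iff the empirical quality clears a threshold.
    return bonus if assessed_quality(effort, n) >= threshold else 0.0

def linear_contract(effort, n=400, rate=1.0):
    # Pay proportionally to the empirical quality.
    return rate * assessed_quality(effort, n)

# With finite n, measurement noise blunts incentives; the paper bounds the
# resulting first-best/second-best gap as Theta(1/sqrt(n log n)) for binary
# contracts and Theta(1/n) for linear contracts.
for e in (0.7, 0.8, 0.9):
    print(e, binary_contract(e), round(linear_contract(e), 3))
```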
arXiv:2502.06238 [cs.CE] (https://arxiv.org/abs/2502.06238)
Title: XNet-Enhanced Deep BSDE Method and Numerical Analysis
Authors: Xiaotao Zheng, Zhihong Xia, Xin Li, Xingye Yue
Abstract: Solving high-dimensional semilinear parabolic partial differential equations (PDEs) challenges traditional numerical methods due to the "curse of dimensionality." Deep learning, particularly through the Deep BSDE method, offers a promising alternative by leveraging neural networks' capability to approximate high-dimensional functions. This paper introduces a novel network architecture, XNet, which significantly enhances the computational efficiency and accuracy of the Deep BSDE method. XNet demonstrates superior approximation capabilities with fewer parameters, addressing the trade-off between approximation and optimization errors found in existing methods. We detail the implementation of XNet within the Deep BSDE framework and present results that show marked improvements in solving high-dimensional PDEs, potentially setting a new standard for such computations.
Submitted 10 February, 2025; originally announced February 2025.
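The Deep BSDE method this paper builds on can be sketched in a few lines: simulate forward paths, propagate Y with the BSDE recursion using per-step networks for Z, and fit the terminal condition. The sketch below uses plain MLPs where the paper would substitute XNet blocks; the generator f, terminal condition g, and the trivial dynamics dX = dW are toy choices.

```python
import torch

d, N, T = 10, 20, 1.0                 # dimension, time steps, horizon
dt = T / N

def f(y):                             # generator of the semilinear PDE (toy)
    return -y.pow(3)

def g(x):                             # terminal condition u(T, x) = g(x)
    return x.pow(2).sum(dim=1, keepdim=True)

y0 = torch.nn.Parameter(torch.zeros(1))          # u(0, x0), learned directly
z_nets = torch.nn.ModuleList(                    # Z_t ~ grad u, one net per step
    [torch.nn.Sequential(torch.nn.Linear(d, 32), torch.nn.ReLU(),
                         torch.nn.Linear(32, d)) for _ in range(N)])
opt = torch.optim.Adam([y0, *z_nets.parameters()], lr=1e-3)

for step in range(200):
    x = torch.zeros(64, d)                       # batch of paths from x0 = 0
    y = y0.expand(64, 1)
    for k in range(N):
        dw = torch.randn(64, d) * dt ** 0.5      # Brownian increments
        z = z_nets[k](x)
        y = y - f(y) * dt + (z * dw).sum(dim=1, keepdim=True)  # BSDE step
        x = x + dw                               # Euler step for dX = dW
    loss = (y - g(x)).pow(2).mean()              # match terminal condition
    opt.zero_grad(); loss.backward(); opt.step()

print(float(y0))                                 # estimate of u(0, x0)
```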
arXiv:2502.06196 [cs.RO, cs.SD] (https://arxiv.org/abs/2502.06196)
Title: Improved Extrinsic Calibration of Acoustic Cameras via Batch Optimization
Authors: Zhi Li, Jiang Wang, Xiaoyang Li, He Kong
Abstract: Acoustic cameras have found many applications in practice. Accurate and reliable extrinsic calibration of the microphone array and visual sensors within acoustic cameras is crucial for fusing visual and auditory measurements. Existing calibration methods either require prior knowledge of the microphone array geometry or rely on grid search, which suffers from slow iteration speed or poor convergence. To overcome these limitations, we propose an automatic calibration technique using a calibration board with both visual and acoustic markers to identify each microphone position in the camera frame. We formulate the extrinsic calibration problem (between microphones and the visual sensor) as a nonlinear least squares problem and employ a batch optimization strategy to solve it. Extensive numerical simulations and real-world experiments show that the proposed method improves both the accuracy and robustness of extrinsic parameter calibration for acoustic cameras, in comparison to existing methods. To benefit the community, we open-source all the code and data at https://github.com/AISLAB-sustech/AcousticCamera
Submitted 10 February, 2025; originally announced February 2025.
Comments: This paper was accepted and is going to be presented at ICASSP 2025.
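The batch nonlinear least-squares formulation can be prototyped directly with scipy. The toy below assumes range measurements (time-of-flight times the speed of sound) between vision-located markers and the unknown microphone positions, which simplifies whatever measurement model the paper actually uses:

```python
import numpy as np
from scipy.optimize import least_squares

rng = np.random.default_rng(0)
M, K = 8, 30                                   # microphones, marker poses
mics_true = rng.uniform(-0.2, 0.2, (M, 3))     # mic positions (camera frame)
sources = rng.uniform(-1.0, 1.0, (K, 3))       # marker positions from vision
ranges = np.linalg.norm(sources[:, None] - mics_true[None], axis=2)
ranges += rng.normal(0, 1e-3, ranges.shape)    # noisy acoustic range data

def residuals(p):
    # Batch residuals over all (marker, mic) pairs: range predicted from the
    # current microphone-position estimate minus the measured range.
    mics = p.reshape(M, 3)
    pred = np.linalg.norm(sources[:, None] - mics[None], axis=2)
    return (pred - ranges).ravel()

sol = least_squares(residuals, x0=np.zeros(M * 3))
print(np.abs(sol.x.reshape(M, 3) - mics_true).max())   # recovery error
```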
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.06196v1-abstract-full').style.display = 'none'; document.getElementById('2502.06196v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">This paper was accepted and is going to be presented at ICASSP 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.06167">arXiv:2502.06167</a> <span> [<a href="https://arxiv.org/pdf/2502.06167">pdf</a>, <a href="https://arxiv.org/format/2502.06167">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Universal Approximation of Visual Autoregressive Transformers </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yifang Chen</a>, <a href="/search/cs?searchtype=author&query=Li%2C+X">Xiaoyu Li</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+Y">Yingyu Liang</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+Z">Zhenmei Shi</a>, <a href="/search/cs?searchtype=author&query=Song%2C+Z">Zhao Song</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.06167v1-abstract-short" style="display: inline;"> We investigate the fundamental limits of transformer-based foundation models, extending our analysis to include Visual Autoregressive (VAR) transformers. VAR represents a big step toward generating images using a novel, scalable, coarse-to-fine ``next-scale prediction'' framework. These models set a new quality bar, outperforming all previous methods, including Diffusion Transformers, while having… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.06167v1-abstract-full').style.display = 'inline'; document.getElementById('2502.06167v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.06167v1-abstract-full" style="display: none;"> We investigate the fundamental limits of transformer-based foundation models, extending our analysis to include Visual Autoregressive (VAR) transformers. VAR represents a big step toward generating images using a novel, scalable, coarse-to-fine ``next-scale prediction'' framework. These models set a new quality bar, outperforming all previous methods, including Diffusion Transformers, while having state-of-the-art performance for image synthesis tasks. 
Our primary contributions establish that, for single-head VAR transformers with a single self-attention layer and single interpolation layer, the VAR Transformer is universal. From the statistical perspective, we prove that such simple VAR transformers are universal approximators for any image-to-image Lipschitz functions. Furthermore, we demonstrate that flow-based autoregressive transformers inherit similar approximation capabilities. Our results provide important design principles for effective and computationally efficient VAR Transformer strategies that can be used to extend their utility to more sophisticated VAR models in image generation and other related areas. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.06167v1-abstract-full').style.display = 'none'; document.getElementById('2502.06167v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.05943">arXiv:2502.05943</a> <span> [<a href="https://arxiv.org/pdf/2502.05943">pdf</a>, <a href="https://arxiv.org/format/2502.05943">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> Sustainable Adaptation for Autonomous Driving with the Mixture of Progressive Experts Networ </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Cui%2C+Y">Yixin Cui</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+S">Shuo Yang</a>, <a href="/search/cs?searchtype=author&query=Wan%2C+C">Chi Wan</a>, <a href="/search/cs?searchtype=author&query=Li%2C+X">Xincheng Li</a>, <a href="/search/cs?searchtype=author&query=Xing%2C+J">Jiaming Xing</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yuanjian Zhang</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+Y">Yanjun Huang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+H">Hong Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.05943v1-abstract-short" style="display: inline;"> Learning-based autonomous driving methods require continuous acquisition of domain knowledge to adapt to diverse driving scenarios. However, due to the inherent challenges of long-tailed data distribution, current approaches still face limitations in complex and dynamic driving environments, particularly when encountering new scenarios and data. This underscores the necessity for enhanced continua… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.05943v1-abstract-full').style.display = 'inline'; document.getElementById('2502.05943v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.05943v1-abstract-full" style="display: none;"> Learning-based autonomous driving methods require continuous acquisition of domain knowledge to adapt to diverse driving scenarios. 
However, due to the inherent challenges of long-tailed data distribution, current approaches still face limitations in complex and dynamic driving environments, particularly when encountering new scenarios and data. This underscores the necessity of enhanced continual learning capabilities to improve system adaptability. To address these challenges, this paper introduces a dynamic progressive optimization framework that facilitates adaptation to variations in dynamic environments, achieved by integrating reinforcement learning and supervised learning for data aggregation. Building on this framework, we propose the Mixture of Progressive Experts (MoPE) network. The proposed method selectively activates multiple expert models based on the distinct characteristics of each task and progressively refines the network architecture to facilitate adaptation to new tasks. Simulation results show that the MoPE model outperforms behavior-cloning methods, achieving up to a 7.3% performance improvement in intricate urban road environments.
Submitted 9 February, 2025; originally announced February 2025.
Comments: 11 pages, 5 figures
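A schematic of what "selective activation plus progressive growth" can look like in code (an illustrative sketch only, not the MoPE architecture; sizes, the top-k router, and the expansion rule are all assumptions):

```python
import torch

class ToyMoPE(torch.nn.Module):
    # Mixture of experts with progressive expansion: a router picks experts
    # per input, and add_expert() grows the pool when a new task arrives.
    def __init__(self, dim=16, n_experts=2):
        super().__init__()
        self.experts = torch.nn.ModuleList(
            [torch.nn.Linear(dim, dim) for _ in range(n_experts)])
        self.router = torch.nn.Linear(dim, n_experts)

    def add_expert(self):
        dim = self.router.in_features
        self.experts.append(torch.nn.Linear(dim, dim))
        old = self.router
        self.router = torch.nn.Linear(dim, len(self.experts))
        with torch.no_grad():                    # keep old routing logits
            self.router.weight[: old.out_features] = old.weight
            self.router.bias[: old.out_features] = old.bias

    def forward(self, x, top_k=1):
        w = torch.softmax(self.router(x), dim=-1)
        idx = w.topk(top_k, dim=-1).indices      # selective activation
        out = torch.zeros_like(x)
        for b in range(x.shape[0]):
            for i in idx[b]:
                out[b] += w[b, i] * self.experts[i](x[b])
        return out

m = ToyMoPE()
m.add_expert()                                   # new task -> new expert
print(m(torch.randn(4, 16)).shape)
```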
arXiv:2502.05743 [cs.LG, cs.CV] (https://arxiv.org/abs/2502.05743)
Title: Understanding Representation Dynamics of Diffusion Models via Low-Dimensional Modeling
Authors: Xiao Li, Zekai Zhang, Xiang Li, Siyi Chen, Zhihui Zhu, Peng Wang, Qing Qu
Abstract: This work addresses the critical question of why and when diffusion models, despite being designed for generative tasks, can excel at learning high-quality representations in a self-supervised manner. To address this, we develop a mathematical framework based on a low-dimensional data model and posterior estimation, revealing a fundamental trade-off between generation and representation quality near the final stage of image generation. Our analysis explains the unimodal representation dynamics across noise scales, mainly driven by the interplay between data denoising and class specification. Building on these insights, we propose an ensemble method that aggregates features across noise levels, significantly improving both clean performance and robustness under label noise. Extensive experiments on both synthetic and real-world datasets validate our findings.
Submitted 8 February, 2025; originally announced February 2025.
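The proposed ensemble is easy to picture: extract features at several noise scales and aggregate them, so no single (possibly suboptimal) scale dominates. A schematic sketch with a placeholder feature extractor standing in for real denoiser activations:

```python
import numpy as np

def diffusion_features(x, sigma):
    # Stand-in for features from a denoiser at noise level sigma
    # (in practice: intermediate U-Net/DiT activations).
    noisy = x + sigma * np.random.randn(*x.shape)
    return np.tanh(noisy)                        # placeholder feature map

def ensemble_features(x, sigmas=(0.1, 0.3, 0.5, 0.8)):
    # Aggregate across noise scales; the paper's analysis says quality is
    # unimodal in sigma, so averaging hedges against a bad single scale.
    return np.mean([diffusion_features(x, s) for s in sigmas], axis=0)

x = np.random.randn(4, 128)
print(ensemble_features(x).shape)
```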
Comments: First two authors contributed equally.

arXiv:2502.05494 [cs.LG, cs.AI, stat.AP] (https://arxiv.org/abs/2502.05494)
Title: Multi-scale Masked Autoencoder for Electrocardiogram Anomaly Detection
Authors: Ya Zhou, Yujie Yang, Jianhuang Gan, Xiangjie Li, Jing Yuan, Wei Zhao
Abstract: Electrocardiogram (ECG) analysis is a fundamental tool for diagnosing cardiovascular conditions, yet anomaly detection in ECG signals remains challenging due to their inherent complexity and variability. We propose the Multi-scale Masked Autoencoder for ECG anomaly detection (MMAE-ECG), a novel end-to-end framework that effectively captures both global and local dependencies in ECG data. Unlike state-of-the-art methods that rely on heartbeat segmentation or R-peak detection, MMAE-ECG eliminates the need for such pre-processing steps, enhancing its suitability for clinical deployment. MMAE-ECG partitions ECG signals into non-overlapping segments, with each segment assigned learnable positional embeddings. A novel multi-scale masking strategy and multi-scale attention mechanism, along with distinct positional embeddings, enable a lightweight Transformer encoder to effectively capture both local and global dependencies. The masked segments are then reconstructed using a single-layer Transformer block, with an aggregation strategy employed during inference to refine the outputs. Experimental results demonstrate that our method achieves performance comparable to state-of-the-art approaches while significantly reducing computational complexity: approximately 1/78 of the floating-point operations (FLOPs) required for inference. Ablation studies further validate the effectiveness of each component, highlighting the potential of multi-scale masked autoencoders for anomaly detection.
Submitted 8 February, 2025; originally announced February 2025.
Comments: Under review in a journal.
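The multi-scale masking strategy can be sketched as follows. This is illustrative only: segment lengths, the masking ratio, and zero-fill masking are assumptions, while the actual model masks embedded segments inside a Transformer rather than raw samples.

```python
import numpy as np

def segment(signal, seg_len):
    # Partition a 1-D ECG trace into non-overlapping segments
    # (truncating any remainder shorter than seg_len).
    n = len(signal) // seg_len
    return signal[: n * seg_len].reshape(n, seg_len)

def multi_scale_mask(signal, scales=(25, 50, 100), ratio=0.5, seed=0):
    # At each scale, split the signal into segments and zero out a random
    # subset, so the encoder sees both fine (local) and coarse (global)
    # corruption patterns of the same recording.
    rng = np.random.default_rng(seed)
    views = []
    for s in scales:
        segs = segment(signal.copy(), s)
        drop = rng.random(len(segs)) < ratio
        segs[drop] = 0.0
        views.append(segs.ravel())
    return views

ecg = np.sin(np.linspace(0, 40 * np.pi, 5000))   # stand-in ECG trace
print([v.shape for v in multi_scale_mask(ecg)])
```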
arXiv:2502.05473 [cs.CV] (https://arxiv.org/abs/2502.05473)
Title: LMS-Net: A Learned Mumford-Shah Network for Few-Shot Medical Image Segmentation
Authors: Shengdong Zhang, Fan Jia, Xiang Li, Hao Zhang, Jun Shi, Liyan Ma, Shihui Ying
Abstract: Few-shot semantic segmentation (FSS) methods have shown great promise in handling data-scarce scenarios, particularly in medical image segmentation tasks. However, most existing FSS architectures lack sufficient interpretability and fail to fully incorporate the underlying physical structures of semantic regions. To address these issues, we propose a novel deep unfolding network, called the Learned Mumford-Shah Network (LMS-Net), for the FSS task. Specifically, motivated by the effectiveness of pixel-to-prototype comparison in prototypical FSS methods and the capability of deep priors to model complex spatial structures, we leverage our learned Mumford-Shah model (LMS model) as a mathematical foundation to integrate these insights into a unified framework. By reformulating the LMS model into prototype update and mask update tasks, we propose an alternating optimization algorithm to solve it efficiently. The iterative steps of this algorithm are then unfolded into corresponding network modules, resulting in LMS-Net with clear interpretability. Comprehensive experiments on three publicly available medical segmentation datasets verify the effectiveness of our method, demonstrating superior accuracy and robustness in handling complex structures and adapting to challenging segmentation scenarios. These results highlight the potential of LMS-Net to advance FSS in medical imaging applications. Our code will be available at: https://github.com/SDZhang01/LMSNet
Submitted 8 February, 2025; originally announced February 2025.
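To see what alternating "prototype update and mask update" means operationally, here is a toy, non-learned analogue: masked average pooling for the prototype and a distance-based soft mask. LMS-Net unfolds learned versions of such steps into network modules with a deep spatial prior.

```python
import numpy as np

def prototype_update(feats, mask):
    # Masked average pooling: prototype = mean feature inside the mask.
    w = mask[:, None]
    return (feats * w).sum(0) / (w.sum() + 1e-6)

def mask_update(feats, proto, tau=1.0):
    # Pixel-to-prototype comparison: soft mask from feature distances.
    d = np.linalg.norm(feats - proto, axis=1)
    return 1.0 / (1.0 + np.exp((d - d.mean()) / tau))

def alternating_segmentation(feats, init_mask, iters=5):
    # Alternate the two updates until the soft mask stabilizes.
    mask = init_mask.astype(float)
    for _ in range(iters):
        proto = prototype_update(feats, mask)
        mask = mask_update(feats, proto)
    return mask

feats = np.random.rand(100, 8)        # 100 "pixels", 8-dim features
print(alternating_segmentation(feats, np.ones(100)).shape)
```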
arXiv:2502.05472 [cs.SI] (https://arxiv.org/abs/2502.05472)
DOI: 10.1145/3696410.3714915 (https://doi.org/10.1145/3696410.3714915)
Title: Robust Deep Signed Graph Clustering via Weak Balance Theory
Authors: Peiyao Zhao, Xin Li, Zeyu Zhang, Mingzhong Wang, Xueying Zhu, Lejian Liao
Abstract: Signed graph clustering is a critical technique for discovering community structures in graphs that exhibit both positive and negative relationships. We have identified two significant challenges in this domain: (i) existing signed spectral methods are highly vulnerable to noise, which is prevalent in real-world scenarios; (ii) the guiding principle "an enemy of my enemy is my friend", rooted in Social Balance Theory, often narrows or disrupts cluster boundaries in mainstream signed graph neural networks. Addressing these challenges, we propose the Deep Signed Graph Clustering framework (DSGC), which leverages Weak Balance Theory to enhance preprocessing and encoding for robust representation learning. First, DSGC introduces Violation Sign-Refine to denoise the signed network by correcting noisy edges with high-order neighbor information. Subsequently, Density-based Augmentation enhances semantic structures by adding positive edges within clusters and negative edges across clusters, following Weak Balance principles. The framework then utilizes these principles to develop clustering-oriented signed neural networks that broaden cluster boundaries by emphasizing distinctions between negatively linked nodes. Finally, DSGC optimizes clustering assignments by minimizing a regularized clustering loss. Comprehensive experiments on synthetic and real-world datasets demonstrate that DSGC consistently outperforms all baselines, establishing a new benchmark in signed graph clustering.
Submitted 8 February, 2025; originally announced February 2025.
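A minimal picture of the Density-based Augmentation step, under assumed inputs (a dense signed adjacency matrix and preliminary cluster labels; the paper's actual procedure is more involved):

```python
import numpy as np

def density_augment(A, labels, n_add=20, seed=0):
    # Following weak-balance principles: add positive (+1) edges between
    # same-cluster node pairs and negative (-1) edges between nodes in
    # different clusters, leaving existing edges untouched.
    rng = np.random.default_rng(seed)
    n = A.shape[0]
    for _ in range(n_add):
        i, j = rng.integers(0, n, size=2)
        if i == j or A[i, j] != 0:
            continue
        sign = 1 if labels[i] == labels[j] else -1
        A[i, j] = A[j, i] = sign
    return A

A = np.zeros((10, 10), dtype=int)
labels = np.array([0] * 5 + [1] * 5)
print(np.count_nonzero(density_augment(A, labels)))
```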

arXiv:2502.05384 [cs.RO] | https://arxiv.org/abs/2502.05384
Demonstrating CavePI: Autonomous Exploration of Underwater Caves by Semantic Guidance
Authors: Alankrit Gupta, Adnan Abdullah, Xianyao Li, Vaishnav Ramesh, Ioannis Rekleitis, Md Jahidul Islam
Abstract: Enabling autonomous robots to safely and efficiently navigate, explore, and map underwater caves is of significant importance to water resource management, hydrogeology, archaeology, and marine robotics. In this work, we demonstrate the system design and algorithmic integration of a visual servoing framework for semantically guided autonomous underwater cave exploration. We present the hardware and edge-AI design considerations to deploy this framework on a novel AUV (Autonomous Underwater Vehicle) named CavePI. The guided navigation is driven by a computationally light yet robust deep visual perception module, delivering a rich semantic understanding of the environment. Subsequently, a robust control mechanism enables CavePI to track the semantic guides and navigate within complex cave structures. We evaluate the system through field experiments in natural underwater caves and spring-water sites and further validate its ROS (Robot Operating System)-based digital twin in a simulation environment. Our results highlight how these integrated design choices facilitate reliable navigation under feature-deprived, GPS-denied, and low-visibility conditions.
Submitted 7 February, 2025; originally announced February 2025.
Comments: V1, 15 pages

arXiv:2502.05213 [cs.CR, cs.AI] | https://arxiv.org/abs/2502.05213
DERMARK: A Dynamic, Efficient and Robust Multi-bit Watermark for Large Language Models
Authors: Qihao Lin, Chen Tang, Lan Zhang, Junyang Zhang, Xiangyang Li
Abstract: Well-trained large language models (LLMs) present significant risks, including potential malicious use and copyright infringement. Current studies aim to trace the distribution of LLM-generated texts by implicitly embedding watermarks. Among these, the single-bit watermarking method can only determine whether a given text was generated by an LLM. In contrast, the multi-bit watermarking method embeds richer information into the generated text, which can identify which LLM generated a given text and to which user it was distributed. However, existing efforts embed the multi-bit watermark directly into the generated text without accounting for its watermarking capacity, which can result in embedding failures when the text's capacity is insufficient. In this paper, we derive the watermark embedding distribution from the logits of LLMs and propose a formal inequality for segmenting the text optimally for watermark embedding. Building on this foundation, we propose DERMARK, a dynamic, efficient, and robust multi-bit watermarking method. DERMARK divides the text into segments of varying lengths for each bit embedding, adaptively matching the text's capacity. It achieves this with negligible overhead and remains robust against text editing by minimizing watermark extraction loss. Comprehensive experiments demonstrate that, compared to the SOTA method, our method reduces the number of tokens required for embedding each bit by 20%, reduces watermark embedding time by 50%, and is robust to text editing and watermark erasure attacks.
Submitted 4 February, 2025; originally announced February 2025.
Comments: 8 pages, 15 figures
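
Neither the derived embedding distribution nor the segmentation inequality appears in the abstract, so the sketch below shows only the generic logit-biasing mechanism that multi-bit watermarking of this kind builds on: each text segment carries one payload bit via a pseudo-random "green" vocabulary split. All names and the delta value are illustrative, not DERMARK's:

```python
import hashlib

def green_set(vocab_size, bit, salt="wm-demo"):
    """Pseudo-randomly partition the vocabulary; which half is 'green'
    encodes one payload bit (illustrative, not DERMARK's scheme)."""
    def is_green(tok):
        h = hashlib.sha256(f"{salt}:{tok}".encode()).digest()[0]
        return (h & 1) == bit
    return {t for t in range(vocab_size) if is_green(t)}

def bias_logits(logits, green, delta=2.0):
    # Boost green tokens so sampling statistically favors them; a
    # detector later counts green hits per segment to recover the bit.
    return [x + delta if i in green else x for i, x in enumerate(logits)]
```

DERMARK's contribution, per the abstract, is choosing the segment boundaries adaptively from the text's measured capacity rather than using fixed-length segments.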

arXiv:2502.05147 [cs.CV, cs.AI] | https://arxiv.org/abs/2502.05147
LP-DETR: Layer-wise Progressive Relations for Object Detection
Authors: Zhengjian Kang, Ye Zhang, Xiaoyu Deng, Xintao Li, Yongzhe Zhang
Abstract: This paper presents LP-DETR (Layer-wise Progressive DETR), a novel approach that enhances DETR-based object detection through multi-scale relation modeling. Our method introduces learnable spatial relationships between object queries through a relation-aware self-attention mechanism, which adaptively learns to balance different scales of relations (local, medium, and global) across decoder layers. This progressive design enables the model to effectively capture evolving spatial dependencies throughout the detection pipeline. Extensive experiments on the COCO 2017 dataset demonstrate that our method improves both convergence speed and detection accuracy compared to the standard self-attention module. The proposed method achieves competitive results, reaching 52.3% AP with 12 epochs and 52.5% AP with 24 epochs using a ResNet-50 backbone, and further improving to 58.0% AP with a Swin-L backbone. Furthermore, our analysis reveals an interesting pattern: the model naturally learns to prioritize local spatial relations in early decoder layers while gradually shifting attention to broader contexts in deeper layers, providing valuable insights for future research in object detection.
Submitted 11 February, 2025; v1 submitted 7 February, 2025; originally announced February 2025.
Comments: 7 pages, 4 figures
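
A rough sketch of what a relation-aware self-attention layer can look like: a per-head learnable gate scales a pairwise spatial-relation bias that is added to the attention logits. This is a generic rendering of the idea under assumed tensor shapes, not LP-DETR's actual module:

```python
import torch
import torch.nn as nn

class RelationBiasedSelfAttention(nn.Module):
    """Self-attention over object queries with an additive, learnably
    gated spatial-relation bias (illustrative sketch)."""
    def __init__(self, dim, heads):
        super().__init__()
        self.attn = nn.MultiheadAttention(dim, heads, batch_first=True)
        self.gate = nn.Parameter(torch.zeros(heads))  # per-head scale

    def forward(self, queries, rel_bias):
        # queries: (B, Q, dim); rel_bias: (Q, Q) pairwise relation scores,
        # e.g. derived from box distances. Gating per head lets each layer
        # weight local vs. global relations differently.
        B = queries.shape[0]
        mask = (self.gate[:, None, None] * rel_bias).repeat(B, 1, 1)
        out, _ = self.attn(queries, queries, queries, attn_mask=mask)
        return out
```

Stacking such layers with depth-dependent gates would let early layers lean local and deep layers lean global, matching the pattern the authors report.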

arXiv:2502.04917 [cs.LG, cs.AI] | https://arxiv.org/abs/2502.04917
Complex Physics-Informed Neural Network
Authors: Chenhao Si, Ming Yan, Xin Li, Zhihong Xia
Abstract: We propose compleX-PINN, a novel physics-informed neural network (PINN) architecture that incorporates a learnable activation function inspired by the Cauchy integral theorem. By learning the parameters of the activation function, compleX-PINN achieves high accuracy with just a single hidden layer. Empirical results show that compleX-PINN effectively solves problems where traditional PINNs struggle, and it consistently delivers significantly higher precision, often by an order of magnitude.
Submitted 7 February, 2025; originally announced February 2025.
Comments: 16 pages, 9 figures
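
For reference, the classical Cauchy integral formula that the abstract credits as inspiration (the learnable parameterization of the activation is not given in the abstract):

```latex
% Cauchy integral formula for f holomorphic inside and on a closed
% contour gamma, with z in the interior:
f(z) = \frac{1}{2\pi i} \oint_{\gamma} \frac{f(\zeta)}{\zeta - z} \, d\zeta
```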

arXiv:2502.04632 [cs.DS, cs.CC, cs.IT] | https://arxiv.org/abs/2502.04632
Tight Bounds for Noisy Computation of High-Influence Functions, Connectivity, and Threshold
Authors: Yuzhou Gu, Xin Li, Yinzhan Xu
Abstract: In the noisy query model, the (binary) return value of every query (possibly repeated) is independently flipped with some fixed probability $p \in (0, 1/2)$. In this paper, we obtain tight bounds on the noisy query complexity of several fundamental problems. Our first contribution is to show that any Boolean function with total influence $\Omega(n)$ has noisy query complexity $\Theta(n \log n)$. Previous works often focus on specific problems, and it is of great interest to have a characterization of noisy query complexity for general functions. Our result is the first noisy query complexity lower bound of this generality, beyond what was known for random Boolean functions [Reischuk and Schmeltz, FOCS 1991]. Our second contribution is to prove that Graph Connectivity has noisy query complexity $\Theta(n^2 \log n)$. In this problem, the goal is to determine whether an undirected graph is connected using noisy edge queries. While the upper bound can be achieved by a simple algorithm, no non-trivial lower bounds were known prior to this work. Last but not least, we determine the exact number of noisy queries (up to lower order terms) needed to solve the $k$-Threshold problem and the Counting problem. The $k$-Threshold problem asks to decide whether there are at least $k$ ones among $n$ bits, given noisy query access to the bits. We prove that $(1 \pm o(1)) \frac{n \log (\min\{k, n-k+1\}/\delta)}{(1-2p) \log \frac{1-p}{p}}$ queries are both sufficient and necessary to achieve error probability $\delta = o(1)$. Previously, such a result was only known when $\min\{k, n-k+1\} = o(n)$ [Wang, Ghaddar, Zhu and Wang, arXiv 2024]. We also show a similar $(1 \pm o(1)) \frac{n \log (\min\{k+1, n-k+1\}/\delta)}{(1-2p) \log \frac{1-p}{p}}$ bound for the Counting problem, where one needs to count the number of ones among $n$ bits given noisy query access, and $k$ denotes the answer.
Submitted 13 February, 2025; v1 submitted 6 February, 2025; originally announced February 2025.
Comments: Abstract abridged to satisfy arXiv requirements. Fixed an error in Open Problem 2 pointed out by Ziao Wang.
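
The k-Threshold bound can be evaluated directly from the expression quoted in the abstract. A small helper for its leading term, with hypothetical parameter values in the example call:

```python
import math

def threshold_queries(n, k, p, delta):
    """Leading term of the tight k-Threshold noisy query bound from the
    abstract: n*log(min(k, n-k+1)/delta) / ((1-2p)*log((1-p)/p))."""
    m = min(k, n - k + 1)
    return n * math.log(m / delta) / ((1 - 2 * p) * math.log((1 - p) / p))

# Example: n = 10^6 bits, majority (k = n/2), noise p = 0.1, error 1%.
print(f"{threshold_queries(10**6, 500_000, 0.1, 0.01):.3e}")
```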

arXiv:2502.04495 [cs.LG] | https://arxiv.org/abs/2502.04495
Discovering Physics Laws of Dynamical Systems via Invariant Function Learning
Authors: Shurui Gui, Xiner Li, Shuiwang Ji
Abstract: We consider learning the underlying laws of dynamical systems governed by ordinary differential equations (ODEs). A key challenge is how to discover intrinsic dynamics across multiple environments while circumventing environment-specific mechanisms. Unlike prior work, we tackle more complex environments where changes extend beyond function coefficients to entirely different function forms. For example, we demonstrate the discovery of the ideal pendulum's natural motion $\alpha^2 \sin(\theta_t)$ by observing pendulum dynamics in different environments, such as the damped environment $\alpha^2 \sin(\theta_t) - \rho \omega_t$ and the powered environment $\alpha^2 \sin(\theta_t) + \rho \frac{\omega_t}{\lvert \omega_t \rvert}$. Here, we formulate this problem as an invariant function learning task and propose a new method, Disentanglement of Invariant Functions (DIF), that is grounded in causal analysis. We propose a causal graph and design an encoder-decoder hypernetwork that explicitly disentangles invariant functions from environment-specific dynamics. The discovery of invariant functions is guaranteed by our information-based principle, which enforces independence between the extracted invariant functions and the environments. Quantitative comparisons with meta-learning and invariant learning baselines on three ODE systems demonstrate the effectiveness and efficiency of our method. Furthermore, symbolic regression explanation results highlight the ability of our framework to uncover intrinsic laws.
Submitted 6 February, 2025; originally announced February 2025.
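
The three environment dynamics are quoted explicitly in the abstract, so they can be simulated side by side; the constants alpha and rho below are hypothetical, and this is only a data-generation sketch, not the DIF method:

```python
import numpy as np
from scipy.integrate import solve_ivp

ALPHA, RHO = 1.0, 0.3  # hypothetical constants

def pendulum_rhs(env):
    """State y = (theta, omega). Shared invariant term alpha^2*sin(theta)
    plus the environment-specific force from the abstract."""
    extra = {"ideal":   lambda w: 0.0,
             "damped":  lambda w: -RHO * w,
             "powered": lambda w: RHO * np.sign(w)}[env]
    return lambda t, y: [y[1], ALPHA**2 * np.sin(y[0]) + extra(y[1])]

for env in ("ideal", "damped", "powered"):
    sol = solve_ivp(pendulum_rhs(env), (0.0, 10.0), [0.5, 0.0], rtol=1e-8)
    print(env, sol.y[0, -1])
```

DIF's task, in these terms, is to recover the shared alpha^2*sin(theta) term from trajectories of all three systems without absorbing the damping or powering terms.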

arXiv:2502.04420 [cs.LG, cs.AI, cs.CL] | https://arxiv.org/abs/2502.04420
KVTuner: Sensitivity-Aware Layer-wise Mixed Precision KV Cache Quantization for Efficient and Nearly Lossless LLM Inference
Authors: Xing Li, Zeyu Xing, Yiming Li, Linping Qu, Hui-Ling Zhen, Wulong Liu, Yiwu Yao, Sinno Jialin Pan, Mingxuan Yuan
Abstract: KV cache quantization can improve Large Language Model (LLM) inference throughput and reduce latency in long-context and large batch-size scenarios while preserving model effectiveness. However, current methods have three unsolved issues: they overlook layer-wise sensitivity to KV cache quantization, incur high overhead from online fine-grained decision-making, and offer little flexibility across different LLMs and constraints. We therefore thoroughly analyze how layer-wise transformer attention patterns correlate with KV cache quantization errors and study why the key cache is more important than the value cache for quantization error reduction. We further propose KVTuner, a simple yet effective framework that adaptively searches for optimal, hardware-friendly, layer-wise KV quantization precision pairs for coarse-grained KV cache using multi-objective optimization, and directly uses the offline-searched configurations during online inference. To reduce the computational cost of offline calibration, we use intra-layer KV precision pair pruning and inter-layer clustering to shrink the search space. Experimental results show that we can achieve nearly lossless 3.25-bit mixed precision KV cache quantization for LLMs such as Llama-3.1-8B-Instruct and 4.0-bit for sensitive models such as Qwen2.5-7B-Instruct on mathematical reasoning tasks. The maximum inference throughput can be improved by 38.3% compared with KV8 quantization over various context lengths.
Submitted 6 February, 2025; originally announced February 2025.
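
A minimal sketch of the layer-wise mixed-precision idea: fake-quantize each layer's K and V tensors to a per-layer bit-width pair, typically giving keys more bits than values in line with the abstract's observation. The uniform symmetric quantizer and the precision pairs are illustrative, not KVTuner's searched configurations:

```python
import torch

def fake_quantize(t, bits):
    """Uniform symmetric fake-quantization to `bits` (illustrative)."""
    qmax = 2 ** (bits - 1) - 1
    scale = t.abs().amax() / qmax
    return (t / scale).round().clamp(-qmax, qmax) * scale

def quantize_kv_cache(kv_cache, precision_pairs):
    # kv_cache: list of (K, V) tensors, one pair per layer;
    # precision_pairs: one (key_bits, value_bits) pair per layer.
    return [(fake_quantize(K, kb), fake_quantize(V, vb))
            for (K, V), (kb, vb) in zip(kv_cache, precision_pairs)]

# Hypothetical 2-layer cache: a sensitive layer gets (8, 4), a robust one (4, 2).
cache = [(torch.randn(1, 8, 16, 64), torch.randn(1, 8, 16, 64)) for _ in range(2)]
quantized = quantize_kv_cache(cache, [(8, 4), (4, 2)])
```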

arXiv:2502.04347 [cs.CL, cs.AI] | https://arxiv.org/abs/2502.04347
SCALM: Detecting Bad Practices in Smart Contracts Through LLMs
Authors: Zongwei Li, Xiaoqi Li, Wenkai Li, Xin Wang
Abstract: As the Ethereum platform continues to mature and gain widespread usage, it is crucial to maintain high standards of smart contract writing practices. While bad practices in smart contracts may not directly lead to security issues, they do elevate the risk of encountering problems. Therefore, to understand and avoid these bad practices, this paper introduces the first systematic study of bad practices in smart contracts, delving into over 35 specific issues. Specifically, we propose SCALM, a framework based on large language models (LLMs). It combines Step-Back Prompting and Retrieval-Augmented Generation (RAG) to identify and address various bad practices effectively. Our extensive experiments using multiple LLMs and datasets show that SCALM outperforms existing tools in detecting bad practices in smart contracts.
Submitted 4 February, 2025; originally announced February 2025.
Comments: 7 pages
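
The abstract names the two prompting components but not their prompts, so the following is only a hedged sketch of how Step-Back Prompting and RAG typically compose; `llm` and `retriever` are hypothetical callables:

```python
def scalm_style_audit(llm, retriever, snippet):
    """Generic Step-Back + RAG flow (illustrative; SCALM's actual prompts,
    retriever, and knowledge base are not described in the abstract)."""
    # Step back: abstract the concrete snippet into a general category.
    category = llm("What general bad-practice category might this "
                   "Solidity snippet fall under?\n" + snippet)
    # Retrieve reference descriptions for that category, then ground the audit.
    docs = retriever(category, top_k=3)
    context = "\n---\n".join(docs)
    return llm("Using these reference descriptions:\n" + context +
               "\nList the bad practices present in:\n" + snippet)
```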

arXiv:2502.04276 [stat.ML, cs.LG, math.AC] | https://arxiv.org/abs/2502.04276
Gaussian Process Regression for Inverse Problems in Linear PDEs
Authors: Xin Li, Markus Lange-Hegermann, Bogdan Raiţă
Abstract: This paper introduces a computationally efficient algorithm in system theory for solving inverse problems governed by linear partial differential equations (PDEs). We model solutions of linear PDEs using Gaussian processes with priors defined based on advanced commutative algebra and algebraic analysis. The implementation of these priors is algorithmic and is achieved using the Macaulay2 computer algebra software. An example application is identifying the wave speed from noisy data for classical wave equations, which are widely used in physics. The method achieves high accuracy while enhancing computational efficiency.
Submitted 6 February, 2025; originally announced February 2025.
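
For orientation, the standard Gaussian process regression posterior that any such solver ultimately evaluates; these are textbook formulas, while the paper's contribution, the algebraically constructed PDE-constrained priors, is not reproduced here:

```latex
% Posterior at a test point x_* given training inputs X, noisy targets y,
% kernel k with Gram matrix K = k(X, X), k_* = k(X, x_*), noise sigma^2:
m(x_*) = k_*^{\top} (K + \sigma^2 I)^{-1} y, \qquad
v(x_*) = k(x_*, x_*) - k_*^{\top} (K + \sigma^2 I)^{-1} k_*
```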

arXiv:2502.04155 [cs.CY, math.OC] | https://arxiv.org/abs/2502.04155
User-Friendly Game-Theoretic Modeling and Analysis of Multi-Modal Transportation Systems
Authors: Margarita Zambrano, Xinling Li, Riccardo Fiorista, Gioele Zardini
Abstract: The evolution of existing transportation systems, mainly driven by urbanization and the increased availability of mobility options such as private, profit-maximizing ride-hailing companies, calls for tools to reason about their design and regulation. To study this complex socio-technical problem, one needs to account for the strategic interactions of the stakeholders involved in the mobility ecosystem. In this paper, we present a game-theoretic framework to model multi-modal mobility systems, focusing on municipalities, service providers, and travelers. Through a user-friendly graphical user interface, one can visualize system dynamics and compute equilibria for various scenarios. The framework enables stakeholders to assess the impact of local decisions (e.g., fleet size for services or taxes for private companies) on the full mobility system. Furthermore, this project aims to foster STEM interest among high school students (e.g., in the context of prior activities in Switzerland, and planned activities with the MIT Museum). This initiative combines theoretical advancements, practical applications, and educational outreach to improve mobility system design.
Submitted 6 February, 2025; originally announced February 2025.

arXiv:2502.04093 [cs.DC] | https://arxiv.org/abs/2502.04093
PSZ: Enhancing the SZ Scientific Lossy Compressor With Progressive Data Retrieval
Authors: Zhuoxun Yang, Sheng Di, Longtao Zhang, Ruoyu Li, Ximiao Li, Jiajun Huang, Jinyang Liu, Franck Cappello, Kai Zhao
Abstract: Compression is a crucial solution for data reduction in modern scientific applications due to the exponential growth of data from simulations, experiments, and observations. Compression with progressive retrieval capability allows users to access coarse approximations of data quickly and then incrementally refine these approximations to higher fidelity. Existing progressive compression solutions suffer from low reduction ratios or high operation costs, effectively undermining the approach's benefits. In this paper, we propose the first interpolation-based progressive lossy compression solution that has both high reduction ratios and low operation costs. The interpolation-based algorithm has been verified as one of the best for scientific data reduction, but no prior effort has made it support progressive retrieval. Our contributions are three-fold: (1) we thoroughly analyze the error characteristics of the interpolation algorithm and propose our solution, IPComp, with multi-level bitplane and predictive coding; (2) we derive optimized strategies toward minimum data retrieval under different fidelity levels indicated by users through error bounds and bitrates; (3) we evaluate the proposed solution using six real-world datasets from four diverse domains. Experimental results demonstrate that our solution achieves up to 487% higher compression ratios and 698% faster speed than other state-of-the-art progressive compressors, reduces the data volume for retrieval by up to 83% compared to baselines under the same error bound, and reduces the error by up to 99% under the same bitrate.
Submitted 7 February, 2025; v1 submitted 6 February, 2025; originally announced February 2025.
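
A small sketch of the bitplane mechanism that makes progressive retrieval possible: integer codes are split into planes from most to least significant, and a reader that stops after a few planes gets a coarser reconstruction. This illustrates the generic idea only; IPComp's multi-level and predictive coding details are in the paper:

```python
import numpy as np

def to_bitplanes(q, nbits):
    """Split non-negative integer codes into bitplanes, MSB first."""
    return [(q >> b) & 1 for b in range(nbits - 1, -1, -1)]

def from_bitplanes(planes, nbits):
    """Rebuild codes from however many leading planes were retrieved."""
    q = np.zeros_like(planes[0])
    for i, p in enumerate(planes):
        q = q | (p << (nbits - 1 - i))
    return q

q = np.array([5, 12, 3], dtype=np.int64)
planes = to_bitplanes(q, nbits=4)
print(from_bitplanes(planes[:2], 4))  # coarse view from 2 planes: [4 12 0]
```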