Search | arXiv e-print repository

Showing 1–50 of 2,547 results for author: Yang, Z
Searching in archive cs. (Search v0.5.6, released 2020-02-24)

1. arXiv:2503.13383 [pdf, other] (cs.CV, cs.AI, cs.CL, cs.LG)
Cream of the Crop: Harvesting Rich, Scalable and Transferable Multi-Modal Data for Instruction Fine-Tuning
Authors: Mengyao Lyu, Yan Li, Huasong Zhong, Wenhao Yang, Hui Chen, Jungong Han, Guiguang Ding, Zhenheng Yang
Abstract: The hypothesis that pretrained
large language models (LLMs) necessitate only minimal supervision during the fine-tuning (SFT) stage (Zhou et al., 2024) has been substantiated by recent advancements in data curation and selection research. However, their stability and generalizability are compromised due to the vulnerability to experimental setups and validation protocols, falling short of surpassing random sampling (Diddee & Ippolito, 2024; Xia et al., 2024b). Built upon LLMs, multi-modal LLMs (MLLMs), combined with the sheer token volume and heightened heterogeneity of data sources, amplify both the significance and complexity of data selection. To harvest multi-modal instructional data in a robust and efficient manner, we re-define the granularity of the quality metric by decomposing it into 14 vision-language-related capabilities, and introduce multi-modal rich scorers to evaluate the capabilities of each data candidate. To promote diversity, in light of the inherent objective of the alignment stage, we take interaction style as diversity indicator and use a multi-modal rich styler to identify data instruction patterns. In doing so, our multi-modal rich scorers and styler (mmSSR) guarantee that high-scoring information is conveyed to users in diversified forms. Free from embedding-based clustering or greedy sampling, mmSSR efficiently scales to millions of data with varying budget constraints, supports customization for general or specific capability acquisition, and facilitates training-free generalization to new domains for curation. Across 10+ experimental settings, validated by 14 multi-modal benchmarks, we demonstrate consistent improvements over random sampling, baseline strategies and state-of-the-art selection methods, achieving 99.1% of full performance with only 30% of the 2.6M data.
Submitted 17 March, 2025; originally announced March 2025.
Comments: update comparison with sota and analysis
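
For orientation, here is a minimal sketch of the scoring-then-diversify selection idea described in the abstract. The callables capability_scores and interaction_style are hypothetical stand-ins for the paper's learned multi-modal rich scorers and styler, and the round-robin step is only one simple way to realize style diversity:

```python
from collections import defaultdict

def select(candidates, capability_scores, interaction_style, budget):
    """Pick high-scoring samples while spreading picks across interaction styles.

    capability_scores(c) -> list of 14 per-capability scores (stand-in).
    interaction_style(c) -> style label used for diversity bucketing (stand-in).
    """
    # Bucket candidates by their identified instruction/interaction style.
    buckets = defaultdict(list)
    for c in candidates:
        buckets[interaction_style(c)].append(c)
    # Within each style, rank by aggregate capability score.
    for style in buckets:
        buckets[style].sort(key=lambda c: sum(capability_scores(c)), reverse=True)
    # Round-robin across styles so high-scoring data arrives in diverse forms.
    selected = []
    while len(selected) < budget and any(buckets.values()):
        for style in list(buckets):
            if buckets[style] and len(selected) < budget:
                selected.append(buckets[style].pop(0))
    return selected
```

Note this avoids embedding-based clustering entirely, consistent with the scalability claim: one pass to bucket, then one sort per bucket.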

2. arXiv:2503.12885 [pdf, other] (cs.CV)
DreamRenderer: Taming Multi-Instance Attribute Control in Large-Scale Text-to-Image Models
Authors: Dewei Zhou, Mingwei Li, Zongxin Yang, Yi Yang
Abstract: Image-conditioned generation methods, such as depth- and canny-conditioned approaches, have demonstrated remarkable abilities for precise image synthesis. However, existing models still struggle to accurately control the content of multiple instances (or regions). Even state-of-the-art models like FLUX and 3DIS face challenges, such as attribute leakage between instances, which limits user control. To address these issues, we introduce DreamRenderer, a training-free approach built upon the FLUX model. DreamRenderer enables users to control the content of each instance via bounding boxes or masks, while ensuring overall visual harmony. We propose two key innovations: 1) Bridge Image Tokens for Hard Text Attribute Binding, which uses replicated image tokens as bridge tokens to ensure that T5 text embeddings, pre-trained solely on text data, bind the correct visual attributes for each instance during Joint Attention; 2) Hard Image Attribute Binding applied only to vital layers. Through our analysis of FLUX, we identify the critical layers responsible for instance attribute rendering and apply Hard Image Attribute Binding only in these layers, using soft binding in the others. This approach ensures precise control while preserving image quality.
Evaluations on the COCO-POS and COCO-MIG benchmarks demonstrate that DreamRenderer improves the Image Success Ratio by 17.7% over FLUX and enhances the performance of layout-to-image models like GLIGEN and 3DIS by up to 26.8%. Project Page: https://limuloo.github.io/DreamRenderer/.
Submitted 17 March, 2025; originally announced March 2025.
Comments: 11 pages
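
As a rough illustration of what "hard attribute binding" can mean at the attention level, the sketch below builds a joint-attention mask in which each instance's image tokens may only attend to that instance's own text tokens. The index-list inputs are assumptions for illustration; the released method additionally uses bridge image tokens and applies hard binding only in FLUX's vital layers, neither of which is reproduced here:

```python
import torch

def hard_binding_mask(n_tokens, instances):
    """instances: list of (image_token_indices, text_token_indices) pairs.

    Returns a boolean (n_tokens, n_tokens) mask; True = attention allowed.
    """
    mask = torch.ones(n_tokens, n_tokens, dtype=torch.bool)
    for img_idx, txt_idx in instances:
        img = torch.tensor(img_idx).unsqueeze(1)  # column vector for broadcasting
        # Block this instance's image tokens from every instance's text tokens...
        for _, other_txt in instances:
            mask[img, torch.tensor(other_txt)] = False
        # ...then re-allow only its own text tokens (the "hard" binding).
        mask[img, torch.tensor(txt_idx)] = True
    return mask
```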

3. arXiv:2503.12734 [pdf, other] (cs.LG, stat.ML)
In-Context Linear Regression Demystified: Training Dynamics and Mechanistic Interpretability of Multi-Head Softmax Attention
Authors: Jianliang He, Xintian Pan, Siyu Chen, Zhuoran Yang
Abstract: We study how multi-head softmax attention models are trained to perform in-context learning on linear data. Through extensive empirical experiments and rigorous theoretical analysis, we demystify the emergence of elegant attention patterns: a diagonal and homogeneous pattern in the key-query (KQ) weights, and a last-entry-only and zero-sum pattern in the output-value (OV) weights. Remarkably, these patterns consistently appear from gradient-based training starting from random initialization. Our analysis reveals that such emergent structures enable multi-head attention to approximately implement a debiased gradient descent predictor -- one that outperforms single-head attention and nearly achieves Bayesian optimality up to a proportional factor. Furthermore, compared to linear transformers, softmax attention readily generalizes to sequences longer than those seen during training. We also extend our study to scenarios with non-isotropic covariates and multi-task linear regression. In the former, multi-head attention learns to implement a form of pre-conditioned gradient descent. In the latter, we uncover an intriguing regime where the interplay between head number and task number triggers a superposition phenomenon that efficiently resolves multi-task in-context learning. Our results reveal that in-context learning ability emerges from the trained transformer as an aggregated effect of its architecture and the underlying data distribution, paving the way for deeper understanding and broader applications of in-context learning.
Submitted 16 March, 2025; originally announced March 2025.
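
For intuition about the predictor mentioned above, recall the standard one-step gradient-descent construction from the linear-attention literature (a sketch only; the paper's debiased variant refines it, and constants depend on the parameterization). Given in-context pairs $(x_i, y_i)_{i=1}^n$ and a query $x_q$,

$$\hat y_q = x_q^\top \hat\beta, \qquad \hat\beta = \frac{\eta}{n} \sum_{i=1}^n y_i x_i,$$

which is one gradient step on the least-squares loss $\frac{1}{2n}\sum_i (y_i - x_i^\top \beta)^2$ starting from $\beta = 0$. A diagonal KQ pattern scores example $i$ by $\langle x_q, x_i \rangle$ and a last-entry-only OV pattern reads out $y_i$, so softmax attention can approximate exactly this kind of weighted sum.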

4. arXiv:2503.12228 [pdf, other] (cs.DC, cs.AI)
Adaptive Fault Tolerance Mechanisms of Large Language Models in Cloud Computing Environments
Authors: Yihong Jin, Ze Yang, Xinhe Xu, Yihan Zhang, Shuyang Ji
Abstract: With the rapid evolution of Large Language Models (LLMs) and their large-scale experimentation in cloud-computing spaces, the challenge of guaranteeing their security and efficiency in failure scenarios has become a main issue. To ensure the reliability and availability of large-scale language models in cloud computing scenarios marked by frequent resource failures, network problems, and computational overheads, this study proposes a novel adaptive fault tolerance mechanism. It builds upon known fault-tolerant mechanisms, such as checkpointing, redundancy, and state transposition, introducing dynamic resource allocation and failure prediction based on real-time performance metrics. The hybrid model integrates a data-driven, deep learning-based anomaly detection technique, underlining the contribution of cloud orchestration middleware for predictive prevention of system failures. Additionally, the model integrates adaptive checkpointing and recovery strategies that dynamically adapt according to load and system state to minimize the impact on model performance and minimize downtime. The experimental results demonstrate that the designed model considerably enhances fault tolerance in large-scale cloud surroundings, decreases system downtime by 30%, and achieves better availability than classical fault tolerance mechanisms.
Submitted 15 March, 2025; originally announced March 2025.
Comments: Accepted by IEEE ICCEA 2025
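
To make the adaptive-checkpointing idea concrete, here is an illustrative sketch that picks the checkpoint interval with the classic Young/Daly first-order rule driven by an online failure-rate estimate. This is an assumption for illustration; the paper's mechanism also involves learned anomaly detection, redundancy, and state transposition, none of which appear here:

```python
import math
import time

class AdaptiveCheckpointer:
    def __init__(self, checkpoint_cost_s, initial_mtbf_s):
        self.cost = checkpoint_cost_s   # seconds to write one checkpoint
        self.mtbf = initial_mtbf_s      # running mean time between failures
        self.last = time.monotonic()

    def record_failure(self, uptime_s, alpha=0.2):
        # Exponential moving average of observed time between failures.
        self.mtbf = (1 - alpha) * self.mtbf + alpha * uptime_s

    def interval(self):
        # Young/Daly first-order optimum: sqrt(2 * checkpoint_cost * MTBF).
        return math.sqrt(2 * self.cost * self.mtbf)

    def should_checkpoint(self):
        # Checkpoint more often when failures are frequent, less when stable.
        if time.monotonic() - self.last >= self.interval():
            self.last = time.monotonic()
            return True
        return False
```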

5. arXiv:2503.12226 [pdf, other] (cs.CR, cs.AI)
Research on Large Language Model Cross-Cloud Privacy Protection and Collaborative Training based on Federated Learning
Authors: Ze Yang, Yihong Jin, Yihan Zhang, Juntian Liu, Xinhe Xu
Abstract: The fast development of large language models (LLMs) and the popularization of cloud computing have made privacy safeguarding and data security of cross-cloud model deployment and training key challenges. We present a new framework for addressing these issues while enabling privacy-preserving collaborative training between distributed clouds based on federated learning. Our mechanism encompasses cutting-edge cryptographic primitives, dynamic model aggregation techniques, and cross-cloud data harmonization solutions to enhance the security, efficiency, and scalability of the traditional federated learning paradigm. Furthermore, we propose a hybrid aggregation scheme to mitigate the threat of data leakage and to optimize the aggregation of model updates, achieving substantial improvements in model effectiveness and stability. Experimental results demonstrate that the training efficiency, privacy protection, and model accuracy of the proposed model compare favorably to those of the traditional federated learning method.
Submitted 15 March, 2025; originally announced March 2025.
Comments: Accepted by IEEE AINIT 2025
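
For reference, the traditional baseline being improved upon is plain FedAvg-style weighted aggregation, sketched below. The paper's hybrid scheme layers cryptographic primitives, dynamic aggregation, and cross-cloud data harmonization on top of this; only the vanilla weighted average is shown:

```python
import numpy as np

def fedavg_aggregate(updates, sample_counts):
    """updates: list of {param_name: np.ndarray} from each cloud.
    sample_counts: number of local training samples per cloud."""
    total = float(sum(sample_counts))
    aggregated = {}
    for name in updates[0]:
        # Weight each cloud's parameters by its share of the training data.
        aggregated[name] = sum(u[name] * (n / total)
                               for u, n in zip(updates, sample_counts))
    return aggregated
```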

6. arXiv:2503.11969 [pdf, other] (cs.CV)
Evaluation of Intra-operative Patient-specific Methods for Point Cloud Completion for Minimally Invasive Liver Interventions
Authors: Nakul Poudel, Zixin Yang, Kelly Merrell, Richard Simon, Cristian A. Linte
Abstract: The registration between the pre-operative model and the intra-operative surface is crucial in image-guided liver surgery, as it facilitates the effective use of pre-operative information during the procedure. However, the intra-operative surface, usually represented as a point cloud, often has limited coverage, especially in laparoscopic surgery, and is prone to holes and noise, posing significant challenges for registration methods. Point cloud completion methods have the potential to alleviate these issues. Thus, we explore six state-of-the-art point cloud completion methods to identify the optimal completion method for liver surgery applications. We focus on a patient-specific approach for liver point cloud completion from a partial liver surface under three cases: canonical pose, non-canonical pose, and canonical pose with noise.
The transformer-based method, AdaPoinTr, outperforms all other methods in generating a complete point cloud from the given partial liver point cloud under the canonical pose. On the other hand, our findings reveal substantial performance degradation of these methods under non-canonical poses and noisy settings, highlighting their limitations and suggesting the need for a robust point completion method for application in image-guided liver surgery.
Submitted 14 March, 2025; originally announced March 2025.
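
For context, completion quality in such comparisons is typically measured with the Chamfer distance between the completed and ground-truth point clouds (an assumption here, since the abstract does not name the exact metrics used):

```python
import numpy as np

def chamfer_distance(p, q):
    """Symmetric Chamfer distance between point clouds p (N, 3) and q (M, 3)."""
    d = np.linalg.norm(p[:, None, :] - q[None, :, :], axis=-1)  # (N, M) pairwise
    return d.min(axis=1).mean() + d.min(axis=0).mean()
```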

7. arXiv:2503.11182 [pdf, other] (cs.CL)
Palette of Language Models: A Solver for Controlled Text Generation
Authors: Zhe Yang, Yi Huang, Yaqin Chen, Xiaoting Wu, Junlan Feng, Chao Deng
Abstract: Recent advancements in large language models have revolutionized text generation with their remarkable capabilities. These models can produce controlled texts that closely adhere to specific requirements when prompted appropriately. However, designing an optimal prompt to control multiple attributes simultaneously can be challenging. A common approach is to linearly combine single-attribute models, but this strategy often overlooks attribute overlaps and can lead to conflicts. Therefore, we propose a novel combination strategy inspired by the Law of Total Probability and Conditional Mutual Information Minimization on generative language models. This method has been adapted for the single-attribute control scenario and is termed the Palette of Language Models due to its theoretical linkage between attribute strength and generation style, akin to blending colors on an artist's palette. Moreover, positive correlation and attribute enhancement are advanced as theoretical properties to guide a rational combination strategy design. We conduct experiments on both single-control and multiple-control settings and achieve superior results.
Submitted 14 March, 2025; originally announced March 2025.
Comments: Accepted to NAACL 2025, Main, Long Paper
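
The linear baseline the abstract criticizes is easy to state: combine the next-token log-probabilities of single-attribute models with fixed weights, as sketched below. The Palette solver replaces this with a combination derived from the Law of Total Probability and conditional-mutual-information minimization, which is not reproduced here:

```python
import torch

def linear_combination_logprobs(logits_per_attribute, weights):
    """logits_per_attribute: list of (vocab_size,) tensors, one per attribute LM."""
    logps = [torch.log_softmax(l, dim=-1) for l in logits_per_attribute]
    mixed = sum(w * lp for w, lp in zip(weights, logps))
    return torch.log_softmax(mixed, dim=-1)  # renormalize the weighted mixture
```

Because the fixed weights ignore how attributes overlap, two correlated attributes get double-counted, which is exactly the conflict the paper's strategy is designed to avoid.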

8. arXiv:2503.10704 [pdf, other] (cs.CV, cs.MM)
Error Analyses of Auto-Regressive Video Diffusion Models: A Unified Framework
Authors: Jing Wang, Fengzhuo Zhang, Xiaoli Li, Vincent Y. F. Tan, Tianyu Pang, Chao Du, Aixin Sun, Zhuoran Yang
Abstract: A variety of Auto-Regressive Video Diffusion Models (ARVDM) have achieved remarkable successes in generating realistic long-form videos. However, theoretical analyses of these models remain scant. In this work, we develop theoretical underpinnings for these models and use our insights to improve the performance of existing models. We first develop Meta-ARVDM, a unified framework of ARVDMs that subsumes most existing methods. Using Meta-ARVDM, we analyze the KL-divergence between the videos generated by Meta-ARVDM and the true videos. Our analysis uncovers two important phenomena inherent to ARVDM -- error accumulation and memory bottleneck. By deriving an information-theoretic impossibility result, we show that the memory bottleneck phenomenon cannot be avoided. To mitigate the memory bottleneck, we design various network structures to explicitly use more past frames. We also achieve a significantly improved trade-off between the mitigation of the memory bottleneck and the inference efficiency by compressing the frames. Experimental results on DMLab and Minecraft validate the efficacy of our methods. Our experiments also demonstrate a Pareto-frontier between the error accumulation and memory bottleneck across different methods.
Submitted 12 March, 2025; originally announced March 2025.

9. arXiv:2503.10589 [pdf, other] (cs.CV)
Long Context Tuning for Video Generation
Authors: Yuwei Guo, Ceyuan Yang, Ziyan Yang, Zhibei Ma, Zhijie Lin, Zhenheng Yang, Dahua Lin, Lu Jiang
Abstract: Recent advances in video generation can produce realistic, minute-long single-shot videos with scalable diffusion transformers. However, real-world narrative videos require multi-shot scenes with visual and dynamic consistency across shots. In this work, we introduce Long Context Tuning (LCT), a training paradigm that expands the context window of pre-trained single-shot video diffusion models to learn scene-level consistency directly from data. Our method expands full attention mechanisms from individual shots to encompass all shots within a scene, incorporating interleaved 3D position embedding and an asynchronous noise strategy, enabling both joint and auto-regressive shot generation without additional parameters. Models with bidirectional attention after LCT can further be fine-tuned with context-causal attention, facilitating auto-regressive generation with efficient KV-cache. Experiments demonstrate single-shot models after LCT can produce coherent multi-shot scenes and exhibit emerging capabilities, including compositional generation and interactive shot extension, paving the way for more practical visual content creation. See https://guoyww.github.io/projects/long-context-video/ for more details.
Submitted 13 March, 2025; originally announced March 2025.
Comments: Project Page: https://guoyww.github.io/projects/long-context-video/
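
A minimal sketch of the attention-expansion idea, assuming shot tokens are simply concatenated (the interleaved 3D position embedding and asynchronous noise strategy are omitted). The context-causal variant mirrors the fine-tuning stage described in the abstract:

```python
import torch

def scene_attention_mask(shot_lengths, causal_across_shots=False):
    """Boolean mask over all tokens of a scene; True = attention allowed."""
    n = sum(shot_lengths)
    if not causal_across_shots:
        # LCT stage: full bidirectional attention across every shot in the scene.
        return torch.ones(n, n, dtype=torch.bool)
    # Context-causal stage: each shot attends to itself and to earlier shots,
    # which is what makes KV-cached auto-regressive shot generation possible.
    mask = torch.zeros(n, n, dtype=torch.bool)
    start = 0
    for length in shot_lengths:
        end = start + length
        mask[start:end, :end] = True
        start = end
    return mask
```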

10. arXiv:2503.10304 [pdf, other] (cs.LG, cs.AI, cs.GT)
Nash Equilibrium Constrained Auto-bidding With Bi-level Reinforcement Learning
Authors: Zhiyu Mou, Miao Xu, Rongquan Bai, Zhuoran Yang, Chuan Yu, Jian Xu, Bo Zheng
Abstract: Many online advertising platforms provide advertisers with auto-bidding services to enhance their advertising performance. However, most existing auto-bidding algorithms fail to accurately capture the auto-bidding problem formulation that the platform truly faces, let alone solve it. Actually, we argue that the platform should try to help optimize each advertiser's performance to the greatest extent -- which makes $\epsilon$-Nash Equilibrium ($\epsilon$-NE) a necessary solution concept -- while maximizing the social welfare of all the advertisers for the platform's long-term value. Based on this, we introduce Nash-Equilibrium Constrained Bidding (NCB), a new formulation of the auto-bidding problem from the platform's perspective. Specifically, it aims to maximize the social welfare of all advertisers under the $\epsilon$-NE constraint. However, the NCB problem presents significant challenges due to its constrained bi-level structure and the typically large number of advertisers involved. To address these challenges, we propose a Bi-level Policy Gradient (BPG) framework with theoretical guarantees. Notably, its computational complexity is independent of the number of advertisers, and the associated gradients are straightforward to compute. Extensive simulated and real-world experiments validate the effectiveness of the BPG framework.
Submitted 13 March, 2025; originally announced March 2025.
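
Written out, the NCB formulation described above has roughly the following shape, with $V_i$ denoting advertiser $i$'s performance under the joint auto-bidding policies (a paraphrase of the abstract; the paper's exact notation may differ):

$$\max_{\pi_1, \dots, \pi_N} \sum_{i=1}^N V_i(\pi_i; \pi_{-i}) \quad \text{s.t.} \quad V_i(\pi_i; \pi_{-i}) \ge \max_{\pi_i'} V_i(\pi_i'; \pi_{-i}) - \epsilon \quad \text{for all } i,$$

i.e., maximize social welfare subject to every advertiser being within $\epsilon$ of its best response, which is the $\epsilon$-NE constraint; the bi-level structure comes from the inner best-response maximization.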

11. arXiv:2503.10045 [pdf, other] (eess.IV, cs.CV)
CPLOYO: A Pulmonary Nodule Detection Model with Multi-Scale Feature Fusion and Nonlinear Feature Learning
Authors: Meng Wang, Zi Yang, Ruifeng Zhao, Yaoting Jiang
Abstract: The integration of Internet of Things (IoT) technology in pulmonary nodule detection significantly enhances the intelligence and real-time capabilities of the detection system. Currently, lung nodule detection primarily focuses on the identification of solid nodules, but different types of lung nodules correspond to various forms of lung cancer. Multi-type detection contributes to improving the overall lung cancer detection rate and enhancing the cure rate. To achieve high sensitivity in nodule detection, targeted improvements were made to the YOLOv8 model. Firstly, the C2f_RepViTCAMF module was introduced to augment the C2f module in the backbone, thereby enhancing detection accuracy for small lung nodules and achieving a lightweight model design. Secondly, the MSCAF module was incorporated to reconstruct the feature fusion section of the model, improving detection accuracy for lung nodules of varying scales. Furthermore, the KAN network was integrated into the model. By leveraging the KAN network's powerful nonlinear feature learning capability, detection accuracy for small lung nodules was further improved, and the model's generalization ability was enhanced. Tests conducted on the LUNA16 dataset demonstrate that the improved model outperforms the original model as well as other mainstream models such as YOLOv9 and RT-DETR across various evaluation metrics.
Submitted 13 March, 2025; originally announced March 2025.

12. arXiv:2503.09675 [pdf, other] (cs.CV)
Accelerating Diffusion Sampling via Exploiting Local Transition Coherence
Authors: Shangwen Zhu, Han Zhang, Zhantao Yang, Qianyu Peng, Zhao Pu, Huangji Wang, Fan Cheng
Abstract: Text-based diffusion models have made significant breakthroughs in generating high-quality images and videos from textual descriptions. However, the lengthy sampling time of the denoising process remains a significant bottleneck in practical applications. Previous methods either ignore the statistical relationships between adjacent steps or rely on attention or feature similarity between them, which often only works with specific network structures. To address this issue, we discover a new statistical relationship in the transition operator between adjacent steps, focusing on the relationship of the outputs from the network. This relationship does not impose any requirements on the network structure. Based on this observation, we propose a novel training-free acceleration method called LTC-Accel, which uses the identified relationship to estimate the current transition operator based on adjacent steps. Because it makes no specific assumptions about the network structure, LTC-Accel is applicable to almost all diffusion-based methods and orthogonal to almost all existing acceleration techniques, making it easy to combine with them. Experimental results demonstrate that LTC-Accel significantly speeds up sampling in text-to-image and text-to-video synthesis while maintaining competitive sample quality.
Specifically, LTC-Accel achieves a speedup of 1.67-fold in Stable Diffusion v2 and a speedup of 1.55-fold in video generation models. When combined with distillation models, LTC-Accel achieves a remarkable 10-fold speedup in video generation, allowing real-time generation of more than 16FPS. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.09675v1-abstract-full').style.display = 'none'; document.getElementById('2503.09675v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.09643">arXiv:2503.09643</a> <span> [<a href="https://arxiv.org/pdf/2503.09643">pdf</a>, <a href="https://arxiv.org/format/2503.09643">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> </div> </div> <p class="title is-5 mathjax"> FedMSGL: A Self-Expressive Hypergraph Based Federated Multi-View Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+D">Daoyuan Li</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Z">Zuyuan Yang</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+S">Shengli Xie</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.09643v1-abstract-short" style="display: inline;"> Federated learning is essential for enabling collaborative model training across decentralized data sources while preserving data privacy and security. This approach mitigates the risks associated with centralized data collection and addresses concerns related to data ownership and compliance. Despite significant advancements in federated learning algorithms that address communication bottlenecks… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.09643v1-abstract-full').style.display = 'inline'; document.getElementById('2503.09643v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.09643v1-abstract-full" style="display: none;"> Federated learning is essential for enabling collaborative model training across decentralized data sources while preserving data privacy and security. This approach mitigates the risks associated with centralized data collection and addresses concerns related to data ownership and compliance. Despite significant advancements in federated learning algorithms that address communication bottlenecks and enhance privacy protection, existing works overlook the impact of differences in data feature dimensions, resulting in global models that disproportionately depend on participants with large feature dimensions. Additionally, current single-view federated learning methods fail to account for the unique characteristics of multi-view data, leading to suboptimal performance in processing such data. 
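<p class="is-size-7">Editor's note: the entry above (arXiv:2503.09675) does not specify its estimator, so the sketch below only illustrates the generic shape of such training-free acceleration: every other step, the denoiser output is extrapolated from the two previous outputs instead of re-running the network. The extrapolation rule, step schedule, and dummy model are assumptions, not the paper's method.</p>
<pre><code class="language-python">
# Toy sketch: skip alternate network calls by extrapolating adjacent outputs.
import torch

@torch.no_grad()
def sample(model, x, sigmas, skip_every=2):
    """model(x, sigma) -> predicted noise; sigmas: decreasing noise levels."""
    prev_eps = []                      # history of denoiser outputs
    for i, sigma in enumerate(sigmas[:-1]):
        if len(prev_eps) >= 2 and i % skip_every == 1:
            # cheap step: eps_t ~= 2*eps_{t-1} - eps_{t-2}, no network call
            eps = 2 * prev_eps[-1] - prev_eps[-2]
        else:
            eps = model(x, sigma)      # real network evaluation
        prev_eps.append(eps)
        x = x + (sigmas[i + 1] - sigma) * eps   # Euler step in sigma space
    return x

dummy = lambda x, s: -x / (1 + s)      # stand-in "denoiser" for a smoke test
x0 = sample(dummy, torch.randn(1, 4, 8, 8), torch.linspace(10.0, 0.0, 21))
print(x0.shape)
</code></pre>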
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.09643">arXiv:2503.09643</a> <span> [<a href="https://arxiv.org/pdf/2503.09643">pdf</a>, <a href="https://arxiv.org/format/2503.09643">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> </div> </div> <p class="title is-5 mathjax"> FedMSGL: A Self-Expressive Hypergraph Based Federated Multi-View Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+D">Daoyuan Li</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Z">Zuyuan Yang</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+S">Shengli Xie</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2503.09643v1-abstract-full" style="display: inline;"> Federated learning is essential for enabling collaborative model training across decentralized data sources while preserving data privacy and security. This approach mitigates the risks associated with centralized data collection and addresses concerns related to data ownership and compliance. Despite significant advancements in federated learning algorithms that address communication bottlenecks and enhance privacy protection, existing works overlook the impact of differences in data feature dimensions, resulting in global models that disproportionately depend on participants with large feature dimensions. Additionally, current single-view federated learning methods fail to account for the unique characteristics of multi-view data, leading to suboptimal performance in processing such data. To address these issues, we propose a Self-expressive Hypergraph Based Federated Multi-view Learning method (FedMSGL). The proposed method leverages the self-expressive property in local training to learn a uniform-dimension subspace that captures latent sample relations. On the central side, an adaptive fusion technique is employed to generate the global model, while a hypergraph is constructed from the learned global and view-specific subspaces to capture intricate interconnections across views. Experiments on multi-view datasets with different feature dimensions validated the effectiveness of the proposed method. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by AAAI 2025</span> </p> </li>
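<p class="is-size-7">Editor's note: a hedged sketch of the self-expressive step the FedMSGL entry above (arXiv:2503.09643) runs locally: represent each sample as a combination of the others, X &asymp; XC, and use C as a latent sample-relation matrix. We solve the ridge-regularized variant in closed form; the paper's local model and the server-side hypergraph fusion are not reproduced here.</p>
<pre><code class="language-python">
# Self-expressive coefficients via ridge regression (illustrative variant).
import numpy as np

def self_expressive_coeffs(X, lam=0.1):
    """X: (d, n), columns are samples; returns C (n, n) with X ~= X @ C."""
    G = X.T @ X                                   # sample Gram matrix
    return np.linalg.solve(G + lam * np.eye(G.shape[0]), G)

rng = np.random.default_rng(0)
X = rng.standard_normal((20, 30))                 # 30 samples in 20 dims
C = self_expressive_coeffs(X)
W = (np.abs(C) + np.abs(C.T)) / 2                 # symmetric affinity for graph building
print(np.linalg.norm(X - X @ C) / np.linalg.norm(X))  # small reconstruction error
</code></pre>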
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.09634">arXiv:2503.09634</a> <span> [<a href="https://arxiv.org/pdf/2503.09634">pdf</a>, <a href="https://arxiv.org/format/2503.09634">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> </div> </div> <p class="title is-5 mathjax"> Identity Preserving Latent Diffusion for Brain Aging Modeling </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Huang%2C+G">Gexin Huang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Z">Zhangsihao Yang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yalin Wang</a>, <a href="/search/cs?searchtype=author&query=Gerig%2C+G">Guido Gerig</a>, <a href="/search/cs?searchtype=author&query=Ren%2C+M">Mengwei Ren</a>, <a href="/search/cs?searchtype=author&query=Li%2C+X">Xiaoxiao Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2503.09634v1-abstract-full" style="display: inline;"> Structural and appearance changes in brain imaging over time are crucial indicators of neurodevelopment and neurodegeneration. The rapid advancement of large-scale generative models provides a promising backbone for modeling these complex global and local changes in brain images, such as transforming the age of a source image to a target age. However, current generative models, typically trained on independently and identically distributed (i.i.d.) data, may struggle to maintain intra-subject spatiotemporal consistency during transformations. We propose the Identity-Preserving Longitudinal Diffusion Model (IP-LDM), designed to accurately transform brain ages while preserving subject identity. Our approach involves first extracting the identity representation from the source image. Then, conditioned on the target age, the latent diffusion model learns to generate the age-transformed target image. To ensure consistency within the same subject over time, we regularize the identity representation using a triplet contrastive formulation. Our experiments on both elderly and infant brain datasets demonstrate that our model outperforms existing conditional generative models, producing realistic age transformations while preserving intra-subject identity. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">19 pages, 10 figures</span> </p> </li>
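<p class="is-size-7">Editor's note: a minimal sketch of the triplet regularization the IP-LDM entry above (arXiv:2503.09634) describes: identity embeddings of two scans of the same subject are pulled together while a different subject is pushed away. The identity encoder below is a stand-in and the diffusion model is omitted; shapes are illustrative.</p>
<pre><code class="language-python">
# Triplet identity regularizer (illustrative), in PyTorch.
import torch
import torch.nn.functional as F

encoder = torch.nn.Sequential(torch.nn.Flatten(), torch.nn.Linear(32 * 32, 128))

def identity_triplet_loss(scan_t0, scan_t1, scan_other, margin=1.0):
    a, p, n = encoder(scan_t0), encoder(scan_t1), encoder(scan_other)
    return F.triplet_margin_loss(a, p, n, margin=margin)

same_subject_t0 = torch.randn(4, 32, 32)   # subject at age t0
same_subject_t1 = torch.randn(4, 32, 32)   # same subject at age t1
other_subject = torch.randn(4, 32, 32)     # negative: a different subject
loss = identity_triplet_loss(same_subject_t0, same_subject_t1, other_subject)
loss.backward()
print(loss.item())
</code></pre>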
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.09573">arXiv:2503.09573</a> <span> [<a href="https://arxiv.org/pdf/2503.09573">pdf</a>, <a href="https://arxiv.org/format/2503.09573">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Block Diffusion: Interpolating Between Autoregressive and Diffusion Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Arriola%2C+M">Marianne Arriola</a>, <a href="/search/cs?searchtype=author&query=Gokaslan%2C+A">Aaron Gokaslan</a>, <a href="/search/cs?searchtype=author&query=Chiu%2C+J+T">Justin T Chiu</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Z">Zhihan Yang</a>, <a href="/search/cs?searchtype=author&query=Qi%2C+Z">Zhixuan Qi</a>, <a href="/search/cs?searchtype=author&query=Han%2C+J">Jiaqi Han</a>, <a href="/search/cs?searchtype=author&query=Sahoo%2C+S+S">Subham Sekhar Sahoo</a>, <a href="/search/cs?searchtype=author&query=Kuleshov%2C+V">Volodymyr Kuleshov</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2503.09573v1-abstract-full" style="display: inline;"> Diffusion language models offer unique benefits over autoregressive models due to their potential for parallelized generation and controllability, yet they lag in likelihood modeling and are limited to fixed-length generation. In this work, we introduce a class of block diffusion language models that interpolate between discrete denoising diffusion and autoregressive models. Block diffusion overcomes key limitations of both approaches by supporting flexible-length generation and improving inference efficiency with KV caching and parallel token sampling. We propose a recipe for building effective block diffusion models that includes an efficient training algorithm, estimators of gradient variance, and data-driven noise schedules to minimize the variance. Block diffusion sets a new state-of-the-art performance among diffusion models on language modeling benchmarks and enables generation of arbitrary-length sequences. We provide the code, along with the model weights and blog post on the project page: https://m-arriola.com/bd3lms/ </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ICLR 2025 Oral. We provide the code at https://github.com/kuleshov-group/bd3lms</span> </p> </li>
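<p class="is-size-7">Editor's note: a toy sketch of the generation pattern the block diffusion entry above (arXiv:2503.09573) interpolates toward: autoregressive over blocks, with parallel iterative unmasking inside each block. The confidence-based unmasking rule and the stand-in model are editorial assumptions, not the released bd3lms code.</p>
<pre><code class="language-python">
# Block-autoregressive generation with parallel in-block denoising (toy).
import torch

MASK = 0  # reserved mask token id

@torch.no_grad()
def generate(model, n_blocks=4, block_size=8, steps=4):
    seq = torch.empty(0, dtype=torch.long)
    for _ in range(n_blocks):                      # autoregressive over blocks
        block = torch.full((block_size,), MASK)
        for s in range(steps):                     # parallel denoising inside
            logits = model(torch.cat([seq, block]))[-block_size:]
            conf, tok = logits.softmax(-1).max(-1)
            still_masked = block == MASK
            if still_masked.any():
                # unmask the most confident positions this round
                k = max(1, int(still_masked.sum()) // (steps - s))
                idx = torch.where(still_masked)[0]
                top = conf[idx].topk(min(k, len(idx))).indices
                block[idx[top]] = tok[idx[top]]
        seq = torch.cat([seq, block])              # block becomes context (KV-cacheable)
    return seq

dummy = lambda x: torch.randn(x.numel(), 100)      # stand-in logits over 100 tokens
print(generate(dummy).shape)                        # torch.Size([32])
</code></pre>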
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.09492">arXiv:2503.09492</a> <span> [<a href="https://arxiv.org/pdf/2503.09492">pdf</a>, <a href="https://arxiv.org/format/2503.09492">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Learning Cascade Ranking as One Network </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yunli Wang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Z">Zhen Zhang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zhiqiang Wang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Z">Zixuan Yang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yu Li</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+J">Jian Yang</a>, <a href="/search/cs?searchtype=author&query=Wen%2C+S">Shiyang Wen</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+P">Peng Jiang</a>, <a href="/search/cs?searchtype=author&query=Gai%2C+K">Kun Gai</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2503.09492v1-abstract-full" style="display: inline;"> Cascade Ranking is a prevalent architecture in large-scale top-k selection systems like recommendation and advertising platforms. Traditional training methods focus on single-stage optimization, neglecting interactions between stages. Recent advances such as RankFlow and FS-LTR have introduced interaction-aware training paradigms but still struggle to 1) align training objectives with the goal of the entire cascade ranking (i.e., end-to-end recall) and 2) learn effective collaboration patterns for different stages. To address these challenges, we propose LCRON, which introduces a novel surrogate loss function derived from the lower bound probability that ground truth items are selected by cascade ranking, ensuring alignment with the overall objective of the system. According to the properties of the derived bound, we further design an auxiliary loss for each stage to drive the reduction of this bound, leading to a more robust and effective top-k selection. LCRON enables end-to-end training of the entire cascade ranking system as a unified network. Experimental results demonstrate that LCRON achieves significant improvement over existing methods on public benchmarks and industrial applications, addressing key limitations in cascade ranking training and significantly enhancing system performance. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">16 pages, 2 figures</span> </p> </li>
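<p class="is-size-7">Editor's note: the LCRON entry above (arXiv:2503.09492) derives a specific lower-bound surrogate; the sketch below only illustrates the broad idea of training a cascade "as one network" with a crude differentiable relaxation, where each stage's survival probability is a sigmoid of the margin over that stage's k-th largest score. This relaxation is an editorial assumption, not the paper's loss.</p>
<pre><code class="language-python">
# Differentiable two-stage cascade training sketch (simplified relaxation).
import torch

def soft_survival(scores, k, tau=0.1):
    thresh = scores.topk(k, dim=-1).values[..., -1:]     # k-th largest score
    return torch.sigmoid((scores - thresh) / tau)        # ~P(selected into top-k)

def cascade_loss(s1, s2, pos, k1=50, k2=10):
    # joint survival prob of each item through retrieval (s1) then ranking (s2)
    p = soft_survival(s1, k1) * soft_survival(s2, k2)
    return -(p[pos].clamp_min(1e-6).log()).mean()        # maximize for positives

s1 = torch.randn(1, 200, requires_grad=True)   # stage-1 scores for 200 candidates
s2 = torch.randn(1, 200, requires_grad=True)   # stage-2 scores (same candidates)
pos = torch.zeros(1, 200, dtype=torch.bool); pos[0, :3] = True
cascade_loss(s1, s2, pos).backward()
print(s1.grad.abs().sum() > 0)                  # gradients reach both stages
</code></pre>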
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.09318">arXiv:2503.09318</a> <span> [<a href="https://arxiv.org/pdf/2503.09318">pdf</a>, <a href="https://arxiv.org/format/2503.09318">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Hardware Architecture">cs.AR</span> </div> </div> <p class="title is-5 mathjax"> FpgaHub: Fpga-centric Hyper-heterogeneous Computing Platform for Big Data Analytics </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zeke Wang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jie Zhang</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+H">Hongjing Huang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yingtao Li</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+X">Xueying Zhu</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+M">Mo Sun</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Z">Zihan Yang</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+D">De Ma</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+H">Huajing Tang</a>, <a href="/search/cs?searchtype=author&query=Pan%2C+G">Gang Pan</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+F">Fei Wu</a>, <a href="/search/cs?searchtype=author&query=He%2C+B">Bingsheng He</a>, <a href="/search/cs?searchtype=author&query=Alonso%2C+G">Gustavo Alonso</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2503.09318v1-abstract-full" style="display: inline;"> Modern data analytics requires a huge amount of computing power and processes a massive amount of data. At the same time, the underlying computing platform is becoming much more heterogeneous on both hardware and software. Even though specialized hardware, e.g., FPGA- or GPU- or TPU-based systems, often achieves better performance than a CPU-only system due to the slowing of Moore's law, such systems are limited in what they can do. For example, GPU-only approaches suffer from severe IO limitations. To truly exploit the potential of hardware heterogeneity, we present FpgaHub, an FPGA-centric hyper-heterogeneous computing platform for big data analytics. The key idea of FpgaHub is to use reconfigurable computing to implement a versatile hub complementing other processors (CPUs, GPUs, DPUs, programmable switches, computational storage, etc.). Using an FPGA as the basis, we can take advantage of its highly reconfigurable nature and rich IO interfaces such as PCIe, networking, and on-board memory, to place it at the center of the architecture and use it as a data and control plane for data movement, scheduling, pre-processing, etc. FpgaHub enables architectural flexibility to allow exploring the rich design space of heterogeneous computing platforms. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li>
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.09172">arXiv:2503.09172</a> <span> [<a href="https://arxiv.org/pdf/2503.09172">pdf</a>, <a href="https://arxiv.org/ps/2503.09172">ps</a>, <a href="https://arxiv.org/format/2503.09172">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> </div> </div> <p class="title is-5 mathjax"> New construction of Locally Perfect Nonlinear Functions with Application to Sequences Sets with Low Ambiguity Zone </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yang%2C+Z">Zhiye Yang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+H">Huaning Liu</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+K">Keqin Feng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2503.09172v1-abstract-full" style="display: inline;"> Low Ambiguity Zone (LAZ) sequences play a pivotal role in modern integrated sensing and communication (ISAC) systems. Recently, Wang et al. [1] proposed a definition of locally perfect nonlinear functions (LPNFs) and constructed three classes of both periodic and aperiodic LAZ sequence sets with flexible parameters by applying such functions and the interleaving method. Some of these LAZ sequence sets are asymptotically optimal with respect to the Ye-Zhou-Liu-Fan-Lei-Tang bounds under certain conditions. In this paper, we proceed with the construction of LPNFs with new parameters. By using these LPNFs, we also present a series of LAZ sequence sets with more flexible parameters, addressing the limitations of existing parameter choices. Furthermore, our results show that one of these classes is asymptotically optimal in both the periodic and aperiodic cases. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li>
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.09017">arXiv:2503.09017</a> <span> [<a href="https://arxiv.org/pdf/2503.09017">pdf</a>, <a href="https://arxiv.org/format/2503.09017">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> Accurate Control under Voltage Drop for Rotor Drones </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yuhang Liu</a>, <a href="/search/cs?searchtype=author&query=Jia%2C+J">Jindou Jia</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Z">Zihan Yang</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+K">Kexin Guo</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2503.09017v1-abstract-full" style="display: inline;"> This letter proposes an anti-disturbance control scheme for rotor drones to counteract the voltage drop (VD) disturbance caused by the drop in battery voltage, a common situation during long-duration flights or aggressive maneuvers. Firstly, the refined dynamics of rotor drones considering the VD disturbance are presented. Based on the dynamics, a voltage drop observer (VDO) is developed to accurately estimate the VD disturbance by decoupling the disturbance and state information of the drone, reducing the conservativeness of conventional disturbance observers. Subsequently, the control scheme integrates the VDO within the translational loop and a fixed-time sliding mode observer (SMO) within the rotational loop, enabling it to address force and torque disturbances caused by the voltage drop of the battery. Extensive real-flight experiments are conducted to demonstrate the effectiveness of the proposed control scheme under VD disturbance. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li>
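<p class="is-size-7">Editor's note: a minimal sketch of the kind of disturbance-observer update a voltage drop observer generalizes (arXiv:2503.09017 above): an additive thrust disturbance is estimated from the mismatch between measured and modeled vertical acceleration. The gains, mass, and nominal model are illustrative assumptions, not the paper's observer.</p>
<pre><code class="language-python">
# First-order disturbance observer sketch (illustrative constants).
def vdo_step(d_hat, a_meas, thrust_cmd, mass=1.2, g=9.81, L=8.0, dt=0.002):
    """d_hat tracks an unmodeled thrust loss, in newtons."""
    a_model = thrust_cmd / mass - g + d_hat / mass   # nominal dynamics + estimate
    return d_hat + L * mass * (a_meas - a_model) * dt

# toy run: the true disturbance is a 2 N thrust loss
d_true, d_hat = -2.0, 0.0
for _ in range(5000):
    a_meas = (14.0 + d_true) / 1.2 - 9.81            # plant with disturbance
    d_hat = vdo_step(d_hat, a_meas, thrust_cmd=14.0)
print(round(d_hat, 3))                               # -> approx -2.0
</code></pre>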
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.08638">arXiv:2503.08638</a> <span> [<a href="https://arxiv.org/pdf/2503.08638">pdf</a>, <a href="https://arxiv.org/format/2503.08638">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> YuE: Scaling Open Foundation Models for Long-Form Music Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yuan%2C+R">Ruibin Yuan</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+H">Hanfeng Lin</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+S">Shuyue Guo</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+G">Ge Zhang</a>, <a href="/search/cs?searchtype=author&query=Pan%2C+J">Jiahao Pan</a>, <a href="/search/cs?searchtype=author&query=Zang%2C+Y">Yongyi Zang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+H">Haohe Liu</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+Y">Yiming Liang</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+W">Wenye Ma</a>, <a href="/search/cs?searchtype=author&query=Du%2C+X">Xingjian Du</a>, <a href="/search/cs?searchtype=author&query=Du%2C+X">Xinrun Du</a>, <a href="/search/cs?searchtype=author&query=Ye%2C+Z">Zhen Ye</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+T">Tianyu Zheng</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+Y">Yinghao Ma</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+M">Minghao Liu</a>, <a href="/search/cs?searchtype=author&query=Tian%2C+Z">Zeyue Tian</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+Z">Ziya Zhou</a>, <a href="/search/cs?searchtype=author&query=Xue%2C+L">Liumeng Xue</a>, <a href="/search/cs?searchtype=author&query=Qu%2C+X">Xingwei Qu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yizhi Li</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+S">Shangda Wu</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+T">Tianhao Shen</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+Z">Ziyang Ma</a>, <a href="/search/cs?searchtype=author&query=Zhan%2C+J">Jun Zhan</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+C">Chunhui Wang</a> , et al. (32 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2503.08638v1-abstract-full" style="display: inline;"> We tackle the task of long-form music generation, particularly the challenging lyrics-to-song problem, by introducing YuE, a family of open foundation models based on the LLaMA2 architecture. Specifically, YuE scales to trillions of tokens and generates up to five minutes of music while maintaining lyrical alignment, coherent musical structure, and engaging vocal melodies with appropriate accompaniment. It achieves this through (1) track-decoupled next-token prediction to overcome dense mixture signals, (2) structural progressive conditioning for long-context lyrical alignment, and (3) a multitask, multiphase pre-training recipe for convergence and generalization. In addition, we redesign the in-context learning technique for music generation, enabling versatile style transfer (e.g., converting Japanese city pop into an English rap while preserving the original accompaniment) and bidirectional generation. Through extensive evaluation, we demonstrate that YuE matches or even surpasses some of the proprietary systems in musicality and vocal agility. In addition, fine-tuning YuE enables additional controls and enhanced support for tail languages. Furthermore, beyond generation, we show that YuE's learned representations can perform well on music understanding tasks, where the results of YuE match or exceed state-of-the-art methods on the MARBLE benchmark. Keywords: lyrics2song, song generation, long-form, foundation model, music generation </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">https://github.com/multimodal-art-projection/YuE</span> </p> </li>
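<p class="is-size-7">Editor's note: an illustrative sketch of "track-decoupled" next-token prediction as described for YuE (arXiv:2503.08638 above): rather than modeling the dense audio mixture, vocal and accompaniment codec tokens are interleaved frame by frame so a decoder-only LM predicts both tracks. The exact token layout is an editorial assumption.</p>
<pre><code class="language-python">
# Frame-wise interleaving of two codec-token tracks (illustrative layout).
import torch

def interleave_tracks(vocal_ids, accomp_ids):
    """(T,) + (T,) -> (2T,) sequence [v_0, a_0, v_1, a_1, ...]."""
    assert vocal_ids.shape == accomp_ids.shape
    out = torch.empty(2 * vocal_ids.numel(), dtype=torch.long)
    out[0::2], out[1::2] = vocal_ids, accomp_ids
    return out

def split_tracks(seq):
    return seq[0::2], seq[1::2]

v = torch.randint(0, 1024, (6,))     # vocal codec tokens for 6 frames
a = torch.randint(0, 1024, (6,))     # accompaniment codec tokens
seq = interleave_tracks(v, a)        # an LM is trained on next-token prediction here
assert torch.equal(split_tracks(seq)[0], v)
print(seq.shape)                     # torch.Size([12])
</code></pre>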
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">https://github.com/multimodal-art-projection/YuE</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.08596">arXiv:2503.08596</a> <span> [<a href="https://arxiv.org/pdf/2503.08596">pdf</a>, <a href="https://arxiv.org/format/2503.08596">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> X-Field: A Physically Grounded Representation for 3D X-ray Reconstruction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+F">Feiran Wang</a>, <a href="/search/cs?searchtype=author&query=Tao%2C+J">Jiachen Tao</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+J">Junyi Wu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+H">Haoxuan Wang</a>, <a href="/search/cs?searchtype=author&query=Duan%2C+B">Bin Duan</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+K">Kai Wang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Z">Zongxin Yang</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+Y">Yan Yan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.08596v1-abstract-short" style="display: inline;"> X-ray imaging is indispensable in medical diagnostics, yet its use is tightly regulated due to potential health risks. To mitigate radiation exposure, recent research focuses on generating novel views from sparse inputs and reconstructing Computed Tomography (CT) volumes, borrowing representations from the 3D reconstruction area. However, these representations originally target visible light imagi… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.08596v1-abstract-full').style.display = 'inline'; document.getElementById('2503.08596v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.08596v1-abstract-full" style="display: none;"> X-ray imaging is indispensable in medical diagnostics, yet its use is tightly regulated due to potential health risks. To mitigate radiation exposure, recent research focuses on generating novel views from sparse inputs and reconstructing Computed Tomography (CT) volumes, borrowing representations from the 3D reconstruction area. However, these representations originally target visible light imaging that emphasizes reflection and scattering effects, while neglecting penetration and attenuation properties of X-ray imaging. In this paper, we introduce X-Field, the first 3D representation specifically designed for X-ray imaging, rooted in the energy absorption rates across different materials. To accurately model diverse materials within internal structures, we employ 3D ellipsoids with distinct attenuation coefficients. To estimate each material's energy absorption of X-rays, we devise an efficient path partitioning algorithm accounting for complex ellipsoid intersections. 
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.08377">arXiv:2503.08377</a> <span> [<a href="https://arxiv.org/pdf/2503.08377">pdf</a>, <a href="https://arxiv.org/format/2503.08377">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Layton: Latent Consistency Tokenizer for 1024-pixel Image Reconstruction and Generation by 256 Tokens </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xie%2C+Q">Qingsong Xie</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Z">Zhao Zhang</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+Z">Zhe Huang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yanhao Zhang</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+H">Haonan Lu</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Z">Zhenyu Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2503.08377v3-abstract-full" style="display: inline;"> Image tokenization has significantly advanced visual generation and multimodal modeling, particularly when paired with autoregressive models. However, current methods face challenges in balancing efficiency and fidelity: high-resolution image reconstruction either requires an excessive number of tokens or compromises critical details through token reduction. To resolve this, we propose Latent Consistency Tokenizer (Layton) that bridges discrete visual tokens with the compact latent space of pre-trained Latent Diffusion Models (LDMs), enabling efficient representation of 1024x1024 images using only 256 tokens, a 16-fold compression over VQGAN. Layton integrates a transformer encoder, a quantized codebook, and a latent consistency decoder. Direct application of an LDM as the decoder results in color and brightness discrepancies. Thus, we convert it into a latent consistency decoder, reducing multi-step sampling to 1-2 steps for direct pixel-level supervision. Experiments demonstrate Layton's superiority in high-fidelity reconstruction, achieving a reconstruction Frechet Inception Distance of 10.8 on the MSCOCO-2017 5K benchmark for 1024x1024 image reconstruction. We also extend Layton to a text-to-image generation model, LaytonGen, which works autoregressively. It achieves a 0.73 score on the GenEval benchmark, surpassing current state-of-the-art methods. Project homepage: https://github.com/OPPO-Mente-Lab/Layton </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 11 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li>
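<p class="is-size-7">Editor's note: a minimal sketch of the vector-quantization step a tokenizer like Layton (arXiv:2503.08377 above) uses between its encoder and decoder: each continuous feature is snapped to the nearest codebook entry and represented by that entry's index. Dimensions below are illustrative, not the paper's configuration.</p>
<pre><code class="language-python">
# Nearest-codebook quantization with a straight-through estimator.
import torch

def quantize(z, codebook):
    """z: (N, D) encoder features; codebook: (K, D). Returns indices, z_q."""
    d = torch.cdist(z, codebook)             # (N, K) pairwise distances
    idx = d.argmin(dim=1)                    # nearest code per feature
    z_q = codebook[idx]                      # quantized features fed to the decoder
    # straight-through estimator so gradients flow to the encoder in training
    return idx, z + (z_q - z).detach()

codebook = torch.randn(512, 64)              # K=512 codes, D=64
z = torch.randn(256, 64)                     # 256 tokens for one image
idx, z_q = quantize(z, codebook)
print(idx.shape, z_q.shape)                  # torch.Size([256]) torch.Size([256, 64])
</code></pre>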
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.08111">arXiv:2503.08111</a> <span> [<a href="https://arxiv.org/pdf/2503.08111">pdf</a>, <a href="https://arxiv.org/format/2503.08111">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> MaRI: Material Retrieval Integration across Domains </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+J">Jianhui Wang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Z">Zhifei Yang</a>, <a href="/search/cs?searchtype=author&query=He%2C+Y">Yangfan He</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+H">Huixiong Zhang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yuxuan Chen</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+J">Jingwei Huang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2503.08111v1-abstract-full" style="display: inline;"> Accurate material retrieval is critical for creating realistic 3D assets. Existing methods rely on datasets that capture shape-invariant and lighting-varied representations of materials, which are scarce and face challenges due to limited diversity and inadequate real-world generalization. Most current approaches adopt traditional image search techniques. They fall short in capturing the unique properties of material spaces, leading to suboptimal performance in retrieval tasks. Addressing these challenges, we introduce MaRI, a framework designed to bridge the feature space gap between synthetic and real-world materials. MaRI constructs a shared embedding space that harmonizes visual and material attributes through a contrastive learning strategy by jointly training an image and a material encoder, bringing similar materials and images closer while separating dissimilar pairs within the feature space. To support this, we construct a comprehensive dataset comprising high-quality synthetic materials rendered with controlled shape variations and diverse lighting conditions, along with real-world materials processed and standardized using material transfer techniques. Extensive experiments demonstrate the superior performance, accuracy, and generalization capabilities of MaRI across diverse and complex material retrieval tasks, outperforming existing methods. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li>
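<p class="is-size-7">Editor's note: a sketch of the symmetric contrastive (InfoNCE-style) objective the MaRI entry above (arXiv:2503.08111) uses to align an image encoder and a material encoder in one embedding space. The encoders are stand-ins here; in the paper they are jointly trained on synthetic and real pairs.</p>
<pre><code class="language-python">
# Symmetric contrastive alignment of two encoders (CLIP-style InfoNCE).
import torch
import torch.nn.functional as F

def contrastive_loss(img_emb, mat_emb, tau=0.07):
    """Matched image/material pairs sit on the diagonal of the sim matrix."""
    img = F.normalize(img_emb, dim=-1)
    mat = F.normalize(mat_emb, dim=-1)
    logits = img @ mat.t() / tau                  # (B, B) cosine similarities
    target = torch.arange(len(img))
    return (F.cross_entropy(logits, target) +
            F.cross_entropy(logits.t(), target)) / 2

img_emb = torch.randn(32, 128, requires_grad=True)   # batch of image features
mat_emb = torch.randn(32, 128, requires_grad=True)   # matching material features
print(contrastive_loss(img_emb, mat_emb).item())
</code></pre>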
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.07981">arXiv:2503.07981</a> <span> [<a href="https://arxiv.org/pdf/2503.07981">pdf</a>, <a href="https://arxiv.org/format/2503.07981">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Genomics">q-bio.GN</span> </div> </div> <p class="title is-5 mathjax"> Regulatory DNA sequence Design with Reinforcement Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yang%2C+Z">Zhao Yang</a>, <a href="/search/cs?searchtype=author&query=Su%2C+B">Bing Su</a>, <a href="/search/cs?searchtype=author&query=Cao%2C+C">Chuan Cao</a>, <a href="/search/cs?searchtype=author&query=Wen%2C+J">Ji-Rong Wen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2503.07981v1-abstract-full" style="display: inline;"> Cis-regulatory elements (CREs), such as promoters and enhancers, are relatively short DNA sequences that directly regulate gene expression. The fitness of CREs, measured by their ability to modulate gene expression, highly depends on the nucleotide sequences, especially specific motifs known as transcription factor binding sites (TFBSs). Designing high-fitness CREs is crucial for therapeutic and bioengineering applications. Current CRE design methods are limited by two major drawbacks: (1) they typically rely on iterative optimization strategies that modify existing sequences and are prone to local optima, and (2) they lack the guidance of biological prior knowledge in sequence optimization. In this paper, we address these limitations by proposing a generative approach that leverages reinforcement learning (RL) to fine-tune a pre-trained autoregressive (AR) model. Our method incorporates data-driven biological priors by deriving computational inference-based rewards that simulate the addition of activator TFBSs and removal of repressor TFBSs, which are then integrated into the RL process. We evaluate our method on promoter design tasks in two yeast media conditions and enhancer design tasks for three human cell types, demonstrating its ability to generate high-fitness CREs while maintaining sequence diversity. The code is available at https://github.com/yangzhao1230/TACO. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li>
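<p class="is-size-7">Editor's note: a hedged sketch of the reward-shaping idea in the entry above (arXiv:2503.07981): an RL fine-tuning reward that adds a bonus per activator TFBS motif and a penalty per repressor motif found in a generated sequence. The motifs and weights below are made up; the paper derives its rewards from computational inference, not a fixed motif list.</p>
<pre><code class="language-python">
# TFBS-aware reward shaping for RL fine-tuning (hypothetical motifs/weights).
ACTIVATOR_MOTIFS = {"TGACTCA": 1.0, "CACGTG": 0.8}   # hypothetical
REPRESSOR_MOTIFS = {"GGGCGG": 0.9}                   # hypothetical

def count_motif(seq: str, motif: str) -> int:
    return sum(seq.startswith(motif, i) for i in range(len(seq) - len(motif) + 1))

def shaped_reward(seq: str, fitness: float) -> float:
    bonus = sum(w * count_motif(seq, m) for m, w in ACTIVATOR_MOTIFS.items())
    penalty = sum(w * count_motif(seq, m) for m, w in REPRESSOR_MOTIFS.items())
    return fitness + bonus - penalty    # fed to the RL objective (e.g. PPO)

print(shaped_reward("ATGACTCACACGTGGGGCGGT", fitness=0.5))  # 0.5 + 1.0 + 0.8 - 0.9
</code></pre>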
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.07520">arXiv:2503.07520</a> <span> [<a href="https://arxiv.org/pdf/2503.07520">pdf</a>, <a href="https://arxiv.org/format/2503.07520">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> </div> </div> <p class="title is-5 mathjax"> From Limited Labels to Open Domains: An Efficient Learning Paradigm for UAV-view Geo-Localization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+Z">Zhongwei Chen</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Z">Zhao-Xu Yang</a>, <a href="/search/cs?searchtype=author&query=Rong%2C+H">Hai-Jun Rong</a>, <a href="/search/cs?searchtype=author&query=Lang%2C+J">Jiawei Lang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2503.07520v1-abstract-full" style="display: inline;"> Traditional UAV-view Geo-Localization (UVGL) supervised paradigms are constrained by the strict reliance on paired data for positive sample selection, which limits their ability to learn cross-view domain-invariant representations from unpaired data. Moreover, it is necessary to reconstruct the pairing relationship with expensive re-labeling costs for scenario-specific training when deploying in a new domain, which fails to meet the practical demands of open-environment applications. To address this issue, we propose a novel cross-domain invariance knowledge transfer network (CDIKTNet), which comprises a cross-domain invariance sub-network and a cross-domain transfer sub-network to realize a closed-loop framework of invariance feature learning and knowledge transfer. The cross-domain invariance sub-network is utilized to construct an essentially shared feature space across domains by learning structural invariance and spatial invariance in cross-view features. Meanwhile, the cross-domain transfer sub-network uses these invariant features as anchors and employs a dual-path contrastive memory learning mechanism to mine latent cross-domain correlation patterns in unpaired data. Extensive experiments demonstrate that our method achieves state-of-the-art performance under fully supervised conditions. More importantly, with merely 2% paired data, our method exhibits performance comparable to existing supervised paradigms and can transfer directly to other scenarios without any prior pairing relationship. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li>
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.07981v1-abstract-full').style.display = 'none'; document.getElementById('2503.07981v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.07520">arXiv:2503.07520</a> <span> [<a href="https://arxiv.org/pdf/2503.07520">pdf</a>, <a href="https://arxiv.org/format/2503.07520">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> </div> </div> <p class="title is-5 mathjax"> From Limited Labels to Open Domains: An Efficient Learning Paradigm for UAV-view Geo-Localization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+Z">Zhongwei Chen</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Z">Zhao-Xu Yang</a>, <a href="/search/cs?searchtype=author&query=Rong%2C+H">Hai-Jun Rong</a>, <a href="/search/cs?searchtype=author&query=Lang%2C+J">Jiawei Lang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.07520v1-abstract-short" style="display: inline;"> Traditional UAV-view Geo-Localization (UVGL) supervised paradigms are constrained by the strict reliance on paired data for positive sample selection, which limits their ability to learn cross-view domain-invariant representations from unpaired data. Moreover, it is necessary to reconstruct the pairing relationship with expensive re-labeling costs for scenario-specific training when deploying in a… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.07520v1-abstract-full').style.display = 'inline'; document.getElementById('2503.07520v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.07520v1-abstract-full" style="display: none;"> Traditional UAV-view Geo-Localization (UVGL) supervised paradigms are constrained by the strict reliance on paired data for positive sample selection, which limits their ability to learn cross-view domain-invariant representations from unpaired data. Moreover, it is necessary to reconstruct the pairing relationship with expensive re-labeling costs for scenario-specific training when deploying in a new domain, which fails to meet the practical demands of open-environment applications. To address this issue, we propose a novel cross-domain invariance knowledge transfer network (CDIKTNet), which comprises a cross-domain invariance sub-network and a cross-domain transfer sub-network to realize a closed-loop framework of invariance feature learning and knowledge transfer. The cross-domain invariance sub-network is utilized to construct an essentially shared feature space across domains by learning structural invariance and spatial invariance in cross-view features. 
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.06592">arXiv:2503.06592</a> <span> [<a href="https://arxiv.org/pdf/2503.06592">pdf</a>, <a href="https://arxiv.org/format/2503.06592">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Optimization and Control">math.OC</span> </div> </div> <p class="title is-5 mathjax"> Automated Proof of Polynomial Inequalities via Reinforcement Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+B">Banglong Liu</a>, <a href="/search/cs?searchtype=author&query=Qi%2C+N">Niuniu Qi</a>, <a href="/search/cs?searchtype=author&query=Zeng%2C+X">Xia Zeng</a>, <a href="/search/cs?searchtype=author&query=Dehbi%2C+L">Lydia Dehbi</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Z">Zhengfeng Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2503.06592v1-abstract-full" style="display: inline;"> Polynomial inequality proving is fundamental to many mathematical disciplines and finds wide applications in diverse fields. Traditional algebraic methods are based on searching for a positive definite polynomial representation over a set of basis polynomials. However, these methods are limited by the truncation degree. To address this issue, this paper proposes an approach based on reinforcement learning to find a Krivine-basis representation for proving polynomial inequalities. Specifically, we formulate the inequality proving problem as a linear programming (LP) problem and encode it as a basis selection problem using reinforcement learning (RL), achieving a non-negative Krivine basis. Moreover, a fast multivariate polynomial multiplication method based on the Fast Fourier Transform (FFT) is employed to enhance the efficiency of the action space search. Furthermore, we have implemented a tool called APPIRL (Automated Proof of Polynomial Inequalities via Reinforcement Learning). Experimental evaluation on benchmark problems demonstrates the feasibility and effectiveness of our approach. In addition, APPIRL has been successfully applied to solve the maximum stable set problem. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li>
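<p class="is-size-7">Editor's note: a hedged sketch of the LP behind a Krivine/Handelman-style certificate, the representation the APPIRL entry above (arXiv:2503.06592) searches for with RL: write p(x) = &Sigma;<sub>i+j&le;d</sub> &lambda;<sub>ij</sub> x<sup>i</sup>(1-x)<sup>j</sup> with &lambda; &ge; 0 to certify p &ge; 0 on [0,1]. APPIRL selects the basis with RL and works with multivariate polynomials; here we brute-force a tiny univariate instance with scipy.</p>
<pre><code class="language-python">
# Handelman/Krivine certificate on [0,1] via an LP feasibility problem.
import numpy as np
from scipy.optimize import linprog

def krivine_certificate(p_coeffs, d=3):
    """p_coeffs: coefficients of p, lowest degree first. Returns lambdas or None."""
    n = d + 1                                   # coefficient slots 1, x, ..., x^d
    p = np.zeros(n); p[:len(p_coeffs)] = p_coeffs
    basis, cols = [], []
    for i in range(d + 1):
        for j in range(d + 1 - i):              # all x^i (1-x)^j with i+j <= d
            b = np.polynomial.polynomial.polymul(
                [0.0] * i + [1.0],
                np.polynomial.polynomial.polypow([1.0, -1.0], j))
            col = np.zeros(n); col[:len(b)] = b
            basis.append((i, j)); cols.append(col)
    res = linprog(np.zeros(len(cols)), A_eq=np.array(cols).T, b_eq=p,
                  bounds=[(0, None)] * len(cols))
    return {b: x for b, x in zip(basis, res.x.round(6)) if x} if res.success else None

# certify p(x) = 1 - 2x + 2x^2 > 0 on [0,1]; note x^2 + (1-x)^2 = p(x)
print(krivine_certificate([1.0, -2.0, 2.0]))
</code></pre>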
We identify a more insidious phenomenon, LLM delusion, defined as high belief hallucinations, incorrect outputs with abnormally high confidence, making them harder to detect and mitigate. Unlike ordinary hallucinations, delusions persist with low uncertainty, posing significant challenges to model reliability. Through empirical analysis across different model families and sizes on several Question Answering tasks, we show that delusions are prevalent and distinct from hallucinations. LLMs exhibit lower honesty with delusions, which are harder to override via finetuning or self reflection. We link delusion formation with training dynamics and dataset noise and explore mitigation strategies such as retrieval augmented generation and multi agent debating to mitigate delusions. By systematically investigating the nature, prevalence, and mitigation of LLM delusions, our study provides insights into the underlying causes of this phenomenon and outlines future directions for improving model reliability. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.06709v1-abstract-full').style.display = 'none'; document.getElementById('2503.06709v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.06592">arXiv:2503.06592</a> <span> [<a href="https://arxiv.org/pdf/2503.06592">pdf</a>, <a href="https://arxiv.org/format/2503.06592">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Optimization and Control">math.OC</span> </div> </div> <p class="title is-5 mathjax"> Automated Proof of Polynomial Inequalities via Reinforcement Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+B">Banglong Liu</a>, <a href="/search/cs?searchtype=author&query=Qi%2C+N">Niuniu Qi</a>, <a href="/search/cs?searchtype=author&query=Zeng%2C+X">Xia Zeng</a>, <a href="/search/cs?searchtype=author&query=Dehbi%2C+L">Lydia Dehbi</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Z">Zhengfeng Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.06592v1-abstract-short" style="display: inline;"> Polynomial inequality proving is fundamental to many mathematical disciplines and finds wide applications in diverse fields. Current traditional algebraic methods are based on searching for a polynomial positive definite representation over a set of basis. However, these methods are limited by truncation degree. 
To address this issue, this paper proposes an approach based on reinforcement learning… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.06592v1-abstract-full').style.display = 'inline'; document.getElementById('2503.06592v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.06592v1-abstract-full" style="display: none;"> Polynomial inequality proving is fundamental to many mathematical disciplines and finds wide applications in diverse fields. Traditional algebraic methods are based on searching for a polynomial positive definite representation over a set of basis polynomials. However, these methods are limited by the truncation degree. To address this issue, this paper proposes an approach based on reinforcement learning to find a Krivine-basis representation for proving polynomial inequalities. Specifically, we formulate the inequality proving problem as a linear programming (LP) problem and encode it as a basis selection problem using reinforcement learning (RL), achieving a non-negative Krivine basis. Moreover, a fast multivariate polynomial multiplication method based on the Fast Fourier Transform (FFT) is employed to enhance the efficiency of the action-space search. Furthermore, we have implemented a tool called APPIRL (Automated Proof of Polynomial Inequalities via Reinforcement Learning). Experimental evaluation on benchmark problems demonstrates the feasibility and effectiveness of our approach. In addition, APPIRL has been successfully applied to solve the maximum stable set problem. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.06592v1-abstract-full').style.display = 'none'; document.getElementById('2503.06592v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025.
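<p>The FFT-based polynomial multiplication mentioned in this abstract is straightforward to sketch in the univariate case: multiplying coefficient vectors via the convolution theorem costs O(n log n) rather than O(n^2). A minimal numpy sketch follows; the paper's method is multivariate and tied to Krivine-basis bookkeeping, which this toy example does not attempt.</p>
<pre><code>
import numpy as np

def poly_mul_fft(a, b):
    """Multiply two univariate polynomials given as coefficient arrays
    (lowest degree first) using the FFT convolution theorem."""
    n = len(a) + len(b) - 1          # degree bound of the product
    fa = np.fft.rfft(a, n)           # pointwise multiply in frequency space
    fb = np.fft.rfft(b, n)
    return np.fft.irfft(fa * fb, n)  # back to coefficient space

# (1 + x)(1 - x) = 1 - x^2
print(np.round(poly_mul_fft([1.0, 1.0], [1.0, -1.0]), 6))  # [ 1.  0. -1.]
</code></pre>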
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.06134">arXiv:2503.06134</a> <span> [<a href="https://arxiv.org/pdf/2503.06134">pdf</a>, <a href="https://arxiv.org/format/2503.06134">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> X2I: Seamless Integration of Multimodal Understanding into Diffusion Transformer via Attention Distillation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ma%2C+J">Jian Ma</a>, <a href="/search/cs?searchtype=author&query=Peng%2C+Q">Qirong Peng</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+X">Xu Guo</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+C">Chen Chen</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+H">Haonan Lu</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Z">Zhenyu Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.06134v1-abstract-short" style="display: inline;"> Text-to-image (T2I) models are well known for their ability to produce highly realistic images, while multimodal large language models (MLLMs) are renowned for their proficiency in understanding and integrating multiple modalities. However, currently there is no straightforward and efficient framework to transfer the multimodal comprehension abilities of MLLMs to T2I models to enable them to under… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.06134v1-abstract-full').style.display = 'inline'; document.getElementById('2503.06134v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.06134v1-abstract-full" style="display: none;"> Text-to-image (T2I) models are well known for their ability to produce highly realistic images, while multimodal large language models (MLLMs) are renowned for their proficiency in understanding and integrating multiple modalities. However, currently there is no straightforward and efficient framework to transfer the multimodal comprehension abilities of MLLMs to T2I models to enable them to understand multimodal inputs. In this paper, we propose the X2I framework, which endows Diffusion Transformer (DiT) models with the capability to comprehend various modalities, including multilingual text, screenshot documents, images, videos, and audio. X2I is trained using merely a 100K-sample English corpus and 160 GPU hours. Building on the DiT teacher model, we adopt an innovative distillation method to extract the inference capabilities of the teacher model and design a lightweight AlignNet structure to serve as an intermediate bridge. Compared to the teacher model, X2I shows a performance degradation of less than 1% while gaining various multimodal understanding abilities, including multilingual to image, image to image, image-text to image, video to image, and audio to image, as well as creative fusion to enhance imagery. Furthermore, it is applicable to LoRA training for image-text to image generation, filling an industry gap in this area. We further design a simple LightControl to enhance the fidelity of instructional image editing.
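<p>The attention-distillation idea described above can be pictured as matching a student block's attention maps to those of a frozen teacher. The sketch below is a generic PyTorch rendition under assumed tensor shapes and an assumed MSE objective; X2I's actual loss and its AlignNet bridge are not reproduced here.</p>
<pre><code>
import torch
import torch.nn.functional as F

def attn_distill_loss(student_q, student_k, teacher_q, teacher_k):
    """MSE between softmax attention maps of a student and a frozen teacher.
    All inputs: (batch, heads, tokens, dim)."""
    d = student_q.shape[-1]
    s_attn = torch.softmax(student_q @ student_k.transpose(-1, -2) / d ** 0.5, dim=-1)
    with torch.no_grad():  # the teacher provides targets only
        t_attn = torch.softmax(teacher_q @ teacher_k.transpose(-1, -2) / d ** 0.5, dim=-1)
    return F.mse_loss(s_attn, t_attn)

# Toy shapes: batch 2, 8 heads, 16 tokens, head dim 64.
q_s, k_s, q_t, k_t = (torch.randn(2, 8, 16, 64) for _ in range(4))
print(attn_distill_loss(q_s, k_s, q_t, k_t))
</code></pre>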
Finally, extensive experiments demonstrate the effectiveness, efficiency, multifunctionality, and transferability of our X2I. The open-source code and checkpoints for X2I can be found at the following link: https://github.com/OPPO-Mente-Lab/X2I. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.06134v1-abstract-full').style.display = 'none'; document.getElementById('2503.06134v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">https://github.com/OPPO-Mente-Lab/X2I</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.06072">arXiv:2503.06072</a> <span> [<a href="https://arxiv.org/pdf/2503.06072">pdf</a>, <a href="https://arxiv.org/format/2503.06072">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> A Survey on Post-training of Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Tie%2C+G">Guiyao Tie</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+Z">Zeli Zhao</a>, <a href="/search/cs?searchtype=author&query=Song%2C+D">Dingjie Song</a>, <a href="/search/cs?searchtype=author&query=Wei%2C+F">Fuyang Wei</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+R">Rong Zhou</a>, <a href="/search/cs?searchtype=author&query=Dai%2C+Y">Yurou Dai</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+W">Wen Yin</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Z">Zhejian Yang</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+J">Jiangyue Yan</a>, <a href="/search/cs?searchtype=author&query=Su%2C+Y">Yao Su</a>, <a href="/search/cs?searchtype=author&query=Dai%2C+Z">Zhenhan Dai</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+Y">Yifeng Xie</a>, <a href="/search/cs?searchtype=author&query=Cao%2C+Y">Yihan Cao</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+L">Lichao Sun</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+P">Pan Zhou</a>, <a href="/search/cs?searchtype=author&query=He%2C+L">Lifang He</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+H">Hechang Chen</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yu Zhang</a>, <a href="/search/cs?searchtype=author&query=Wen%2C+Q">Qingsong Wen</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+T">Tianming Liu</a>, <a href="/search/cs?searchtype=author&query=Gong%2C+N+Z">Neil Zhenqiang Gong</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+J">Jiliang Tang</a>, <a href="/search/cs?searchtype=author&query=Xiong%2C+C">Caiming Xiong</a>, <a href="/search/cs?searchtype=author&query=Ji%2C+H">Heng Ji</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+P+S">Philip S. Yu</a> , et al. 
(1 additional author not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.06072v1-abstract-short" style="display: inline;"> The emergence of Large Language Models (LLMs) has fundamentally transformed natural language processing, making them indispensable across domains ranging from conversational systems to scientific exploration. However, their pre-trained architectures often reveal limitations in specialized contexts, including restricted reasoning capacities, ethical uncertainties, and suboptimal domain-specific per… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.06072v1-abstract-full').style.display = 'inline'; document.getElementById('2503.06072v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.06072v1-abstract-full" style="display: none;"> The emergence of Large Language Models (LLMs) has fundamentally transformed natural language processing, making them indispensable across domains ranging from conversational systems to scientific exploration. However, their pre-trained architectures often reveal limitations in specialized contexts, including restricted reasoning capacities, ethical uncertainties, and suboptimal domain-specific performance. These challenges necessitate advanced post-training language models (PoLMs), such as OpenAI-o1/o3 and DeepSeek-R1 (collectively known as Large Reasoning Models, or LRMs), to address these shortcomings. This paper presents the first comprehensive survey of PoLMs, systematically tracing their evolution across five core paradigms: Fine-tuning, which enhances task-specific accuracy; Alignment, which ensures alignment with human preferences; Reasoning, which advances multi-step inference despite challenges in reward design; Efficiency, which optimizes resource utilization amidst increasing complexity; and Integration and Adaptation, which extend capabilities across diverse modalities while addressing coherence issues. Charting progress from ChatGPT's foundational alignment strategies to DeepSeek-R1's innovative reasoning advancements, we illustrate how PoLMs leverage datasets to mitigate biases, deepen reasoning capabilities, and enhance domain adaptability. Our contributions include a pioneering synthesis of PoLM evolution, a structured taxonomy categorizing techniques and datasets, and a strategic agenda emphasizing the role of LRMs in improving reasoning proficiency and domain flexibility. As the first survey of its scope, this work consolidates recent PoLM advancements and establishes a rigorous intellectual framework for future research, fostering the development of LLMs that excel in precision, ethical robustness, and versatility across scientific and societal applications. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.06072v1-abstract-full').style.display = 'none'; document.getElementById('2503.06072v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025.
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">87 pages, 21 figures, 9 tables</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.05740">arXiv:2503.05740</a> <span> [<a href="https://arxiv.org/pdf/2503.05740">pdf</a>, <a href="https://arxiv.org/format/2503.05740">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computers and Society">cs.CY</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> ChatWise: AI-Powered Engaging Conversations for Enhancing Senior Cognitive Wellbeing </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yang%2C+Z">Zhengbang Yang</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+Z">Zhuangdi Zhu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.05740v1-abstract-short" style="display: inline;"> Cognitive health in older adults presents a growing challenge. While conversational interventions show feasibility in improving cognitive wellness, human caregiver resources remain overburdened. AI-based methods have shown promise in providing conversational support, yet existing work is limited to implicit strategies and lacks multi-turn support tailored to seniors. We improve prior art with an… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.05740v1-abstract-full').style.display = 'inline'; document.getElementById('2503.05740v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.05740v1-abstract-full" style="display: none;"> Cognitive health in older adults presents a growing challenge. While conversational interventions show feasibility in improving cognitive wellness, human caregiver resources remain overburdened. AI-based methods have shown promise in providing conversational support, yet existing work is limited to implicit strategies and lacks multi-turn support tailored to seniors. We improve prior art with an LLM-driven chatbot named ChatWise for older adults. It follows dual-level conversation reasoning during the inference phase to provide engaging companionship. ChatWise thrives in long-turn conversations, in contrast to conventional LLMs that primarily excel in short-turn exchanges. Grounded experiments show that ChatWise significantly enhances simulated users' cognitive and emotional status, including those with Mild Cognitive Impairment. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.05740v1-abstract-full').style.display = 'none'; document.getElementById('2503.05740v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025.
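<p>The dual-level conversation reasoning described above suggests a two-pass inference loop: one call selects a high-level conversational strategy, and a second call realizes it as the next turn. The sketch below is only a rough rendition of that idea; the <code>llm</code> callable, the strategy menu, and both prompts are placeholders, not ChatWise's actual design.</p>
<pre><code>
def dual_level_reply(history, llm):
    """Two-pass inference: pick a high-level strategy, then realize it."""
    # Pass 1: strategy-level reasoning over the dialogue so far.
    strategy = llm(
        "Given this conversation with an older adult, choose ONE strategy "
        "(reminisce / open question / gentle encouragement):\n" + history
    )
    # Pass 2: utterance-level generation conditioned on the chosen strategy.
    return llm(
        f"Strategy: {strategy}\nWrite the next chatbot turn, warm and engaging, "
        f"consistent with the strategy.\nConversation:\n{history}"
    )

# Stub LLM for demonstration; swap in a real chat-completion call.
reply = dual_level_reply("User: I used to garden a lot.", lambda p: "open question")
print(reply)
</code></pre>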
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.05734">arXiv:2503.05734</a> <span> [<a href="https://arxiv.org/pdf/2503.05734">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computers and Society">cs.CY</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Modeling Behavior Change for Multi-model At-Risk Students Early Prediction (extended version) </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Cheng%2C+J">Jiabei Cheng</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Z">Zhen-Qun Yang</a>, <a href="/search/cs?searchtype=author&query=Cao%2C+J">Jiannong Cao</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Y">Yu Yang</a>, <a href="/search/cs?searchtype=author&query=Poon%2C+K+C+F">Kai Cheung Franky Poon</a>, <a href="/search/cs?searchtype=author&query=Lai%2C+D">Daniel Lai</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.05734v1-abstract-short" style="display: inline;"> In the educational domain, identifying students at risk of dropping out is essential for allowing educators to intervene effectively, improving both academic outcomes and overall student well-being. Data in educational settings often originate from diverse sources, such as assignments, grades, and attendance records. However, most existing research relies on online learning data and just extractin… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.05734v1-abstract-full').style.display = 'inline'; document.getElementById('2503.05734v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.05734v1-abstract-full" style="display: none;"> In the educational domain, identifying students at risk of dropping out is essential for allowing educators to intervene effectively, improving both academic outcomes and overall student well-being. Data in educational settings often originate from diverse sources, such as assignments, grades, and attendance records. However, most existing research relies on online learning data and merely extracts quantitative features. While quantification eases processing, it also leads to a significant loss of original information. Moreover, current models primarily identify students with consistently poor performance through simple and discrete behavioural patterns, failing to capture the complex continuity and non-linear changes in student behaviour. We have developed an innovative prediction model, Multimodal-ChangePoint Detection (MCPD), utilizing textual teacher remark data and numerical grade data from middle schools. Our model achieves a highly integrated and intelligent analysis by using independent encoders to process the two data types and fusing the encoded features. The model further refines its analysis by leveraging a changepoint detection module to pinpoint crucial behavioral changes, which are integrated as dynamic weights through a simple attention mechanism.
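<p>One way to picture the changepoint-to-weight step just described: score each time step of a numeric series by the mean shift around it, then softmax the scores into fusion weights. This toy numpy sketch is an assumption-laden illustration; the abstract does not specify MCPD's actual detector or weighting.</p>
<pre><code>
import numpy as np

def changepoint_weights(series, w=3):
    """Score each step by |mean(after) - mean(before)| over a window w,
    then normalize the scores into attention-style weights via softmax."""
    s = np.asarray(series, dtype=float)
    scores = np.zeros_like(s)
    for t in range(w, len(s) - w):
        scores[t] = abs(s[t:t + w].mean() - s[t - w:t].mean())
    e = np.exp(scores - scores.max())
    return e / e.sum()

grades = [85, 84, 86, 85, 70, 62, 60, 58]  # a behaviour change mid-series
print(np.round(changepoint_weights(grades), 3))  # weight peaks at the shift
</code></pre>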
Experimental validation indicates that our model achieves an accuracy in the range of 70-75%, outperforming baseline algorithms by approximately 5-10% on average. Additionally, our algorithm demonstrates a certain degree of transferability, maintaining high accuracy when adjusted and retrained with different definitions of at-risk status, demonstrating its broad applicability. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.05734v1-abstract-full').style.display = 'none'; document.getElementById('2503.05734v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.05161">arXiv:2503.05161</a> <span> [<a href="https://arxiv.org/pdf/2503.05161">pdf</a>, <a href="https://arxiv.org/format/2503.05161">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computational Engineering, Finance, and Science">cs.CE</span> </div> </div> <p class="title is-5 mathjax"> GaussianCAD: Robust Self-Supervised CAD Reconstruction from Three Orthographic Views Using 3D Gaussian Splatting </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhou%2C+Z">Zheng Zhou</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zhe Li</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+B">Bo Yu</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+L">Lina Hu</a>, <a href="/search/cs?searchtype=author&query=Dong%2C+L">Liang Dong</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Z">Zijian Yang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+X">Xiaoli Liu</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+N">Ning Xu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Ziwei Wang</a>, <a href="/search/cs?searchtype=author&query=Dang%2C+Y">Yonghao Dang</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+J">Jianqin Yin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.05161v1-abstract-short" style="display: inline;"> The automatic reconstruction of 3D computer-aided design (CAD) models from CAD sketches has recently gained significant attention in the computer vision community. Most existing methods, however, rely on vector CAD sketches and 3D ground truth for supervision, which are often difficult to obtain in industrial applications and are sensitive to noisy inputs. We propose viewing CAD reconstructio… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.05161v1-abstract-full').style.display = 'inline'; document.getElementById('2503.05161v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.05161v1-abstract-full" style="display: none;"> The automatic reconstruction of 3D computer-aided design (CAD) models from CAD sketches has recently gained significant attention in the computer vision community.
Most existing methods, however, rely on vector CAD sketches and 3D ground truth for supervision, which are often difficult to obtain in industrial applications and are sensitive to noisy inputs. We propose viewing CAD reconstruction as a specific instance of sparse-view 3D reconstruction to overcome these limitations. While this reformulation offers a promising perspective, existing 3D reconstruction methods typically require natural images and corresponding camera poses as inputs, which introduces two significant challenges: (1) modality discrepancy between CAD sketches and natural images, and (2) the difficulty of accurate camera pose estimation for CAD sketches. To solve these issues, we first transform the CAD sketches into representations resembling natural images and extract corresponding masks. Next, we manually calculate the camera poses for the orthographic views to ensure accurate alignment within the 3D coordinate system. Finally, we employ a customized sparse-view 3D reconstruction method to achieve high-quality reconstructions from aligned orthographic views. By leveraging raster CAD sketches for self-supervision, our approach eliminates the reliance on vector CAD sketches and 3D ground truth. Experiments on the Sub-Fusion360 dataset demonstrate that our proposed method significantly outperforms previous approaches in CAD reconstruction performance and exhibits strong robustness to noisy inputs. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.05161v1-abstract-full').style.display = 'none'; document.getElementById('2503.05161v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025.
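<p>The step of manually calculating camera poses for the three orthographic views is concrete enough to sketch: each view is a fixed world-to-camera rotation built from a viewing direction and an up vector. The numpy sketch below uses assumed axis conventions; the paper's exact frames may differ.</p>
<pre><code>
import numpy as np

def look_at_orthographic(direction, up):
    """Build a 4x4 world-to-camera pose for an orthographic view
    looking along `direction` with the given up vector."""
    z = direction / np.linalg.norm(direction)      # camera forward
    x = np.cross(up, z); x /= np.linalg.norm(x)    # camera right
    y = np.cross(z, x)                             # camera up
    pose = np.eye(4)
    pose[:3, :3] = np.stack([x, y, z])             # rows: right, up, forward
    return pose

# Assumed conventions: z is vertical, the object sits at the origin.
front = look_at_orthographic(np.array([0., -1., 0.]), np.array([0., 0., 1.]))
top   = look_at_orthographic(np.array([0., 0., -1.]), np.array([0., 1., 0.]))
side  = look_at_orthographic(np.array([-1., 0., 0.]), np.array([0., 0., 1.]))
print(front)
</code></pre>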
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.05031">arXiv:2503.05031</a> <span> [<a href="https://arxiv.org/pdf/2503.05031">pdf</a>, <a href="https://arxiv.org/format/2503.05031">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Neurons and Cognition">q-bio.NC</span> </div> </div> <p class="title is-5 mathjax"> Enhancing Alzheimer's Diagnosis: Leveraging Anatomical Landmarks in Graph Convolutional Neural Networks on Tetrahedral Meshes </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yanxi Chen</a>, <a href="/search/cs?searchtype=author&query=Farazi%2C+M">Mohammad Farazi</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Z">Zhangsihao Yang</a>, <a href="/search/cs?searchtype=author&query=Fan%2C+Y">Yonghui Fan</a>, <a href="/search/cs?searchtype=author&query=Ashton%2C+N">Nicholas Ashton</a>, <a href="/search/cs?searchtype=author&query=Reiman%2C+E+M">Eric M Reiman</a>, <a href="/search/cs?searchtype=author&query=Su%2C+Y">Yi Su</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yalin Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.05031v1-abstract-short" style="display: inline;"> Alzheimer's disease (AD) is a major neurodegenerative condition that affects millions around the world. As one of the main biomarkers in the AD diagnosis procedure, brain amyloid positivity is typically identified by positron emission tomography (PET), which is costly and invasive. Brain structural magnetic resonance imaging (sMRI) may provide a safer and more convenient solution for the AD diagno… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.05031v1-abstract-full').style.display = 'inline'; document.getElementById('2503.05031v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.05031v1-abstract-full" style="display: none;"> Alzheimer's disease (AD) is a major neurodegenerative condition that affects millions around the world. As one of the main biomarkers in the AD diagnosis procedure, brain amyloid positivity is typically identified by positron emission tomography (PET), which is costly and invasive. Brain structural magnetic resonance imaging (sMRI) may provide a safer and more convenient solution for AD diagnosis. Recent advances in geometric deep learning have facilitated sMRI analysis and early diagnosis of AD. However, determining AD pathology, such as brain amyloid deposition, in the preclinical stage remains challenging, as only subtle morphological changes can be observed. As a result, few AD classification models are generalizable to the brain amyloid positivity classification task.
Blood-based biomarkers (BBBMs), on the other hand, have recently achieved remarkable success in predicting brain amyloid positivity and identifying individuals at high risk of being brain amyloid positive. However, individuals in the medium-risk group still require gold-standard tests such as Amyloid PET for further evaluation. Inspired by the recent success of transformer architectures, we propose a transformer-based geometric deep learning model that is both scalable and robust to variations in input volumetric mesh size. Our work introduces a novel tokenization scheme for tetrahedral meshes, incorporating anatomical landmarks generated by a pre-trained Gaussian process model. Our model achieves superior performance in the AD classification task. In addition, we show that the model also generalizes to brain amyloid positivity prediction for individuals in the medium-risk class, where BBBMs alone cannot achieve a clear classification. Our work may enrich geometric deep learning research and improve AD diagnosis accuracy without using expensive and invasive PET scans. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.05031v1-abstract-full').style.display = 'none'; document.getElementById('2503.05031v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.04837">arXiv:2503.04837</a> <span> [<a href="https://arxiv.org/pdf/2503.04837">pdf</a>, <a href="https://arxiv.org/format/2503.04837">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> </div> </div> <p class="title is-5 mathjax"> FedPalm: A General Federated Learning Framework for Closed- and Open-Set Palmprint Verification </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yang%2C+Z">Ziyuan Yang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yingyu Chen</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+C">Chengrui Gao</a>, <a href="/search/cs?searchtype=author&query=Teoh%2C+A+B+J">Andrew Beng Jin Teoh</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+B">Bob Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yi Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.04837v1-abstract-short" style="display: inline;"> Current deep learning (DL)-based palmprint verification models rely on centralized training with large datasets, which raises significant privacy concerns due to biometric data's sensitive and immutable nature. Federated learning (FL), a privacy-preserving distributed learning paradigm, offers a compelling alternative by enabling collaborative model training without the need for data sharing.
Howe… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.04837v1-abstract-full').style.display = 'inline'; document.getElementById('2503.04837v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.04837v1-abstract-full" style="display: none;"> Current deep learning (DL)-based palmprint verification models rely on centralized training with large datasets, which raises significant privacy concerns due to biometric data's sensitive and immutable nature. Federated learning (FL), a privacy-preserving distributed learning paradigm, offers a compelling alternative by enabling collaborative model training without the need for data sharing. However, FL-based palmprint verification faces critical challenges, including data heterogeneity from diverse identities and the absence of standardized evaluation benchmarks. This paper addresses these gaps by establishing a comprehensive benchmark for FL-based palmprint verification, which explicitly defines and evaluates two practical scenarios: closed-set and open-set verification. We propose FedPalm, a unified FL framework that balances local adaptability with global generalization. Each client trains a personalized textural expert tailored to local data and collaboratively contributes to a shared global textural expert for extracting generalized features. To further enhance verification performance, we introduce a Textural Expert Interaction Module that dynamically routes textural features among experts to generate refined side textural features. Learnable parameters are employed to model relationships between original and side features, fostering cross-texture-expert interaction and improving feature discrimination. Extensive experiments validate the effectiveness of FedPalm, demonstrating robust performance across both scenarios and providing a promising foundation for advancing FL-based palmprint verification research. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.04837v1-abstract-full').style.display = 'none'; document.getElementById('2503.04837v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025.
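<p>The shared global textural expert above can be pictured, at its simplest, as a federated average of the client experts' weights. The PyTorch sketch below shows classic FedAvg as an illustration only; FedPalm's expert interaction and routing modules go well beyond plain averaging.</p>
<pre><code>
import torch

def fed_avg(client_state_dicts, client_sizes):
    """Weighted average of client model parameters (classic FedAvg)."""
    total = sum(client_sizes)
    avg = {}
    for name in client_state_dicts[0]:
        avg[name] = sum(
            sd[name] * (n / total) for sd, n in zip(client_state_dicts, client_sizes)
        )
    return avg

# Three clients with (hypothetical) local textural experts of the same shape.
clients = [torch.nn.Linear(8, 2).state_dict() for _ in range(3)]
global_expert = torch.nn.Linear(8, 2)
global_expert.load_state_dict(fed_avg(clients, [100, 50, 25]))
</code></pre>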
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.04639">arXiv:2503.04639</a> <span> [<a href="https://arxiv.org/pdf/2503.04639">pdf</a>, <a href="https://arxiv.org/format/2503.04639">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Enhancing SAM with Efficient Prompting and Preference Optimization for Semi-supervised Medical Image Segmentation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Konwer%2C+A">Aishik Konwer</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Z">Zhijian Yang</a>, <a href="/search/cs?searchtype=author&query=Bas%2C+E">Erhan Bas</a>, <a href="/search/cs?searchtype=author&query=Xiao%2C+C">Cao Xiao</a>, <a href="/search/cs?searchtype=author&query=Prasanna%2C+P">Prateek Prasanna</a>, <a href="/search/cs?searchtype=author&query=Bhatia%2C+P">Parminder Bhatia</a>, <a href="/search/cs?searchtype=author&query=Kass-Hout%2C+T">Taha Kass-Hout</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.04639v1-abstract-short" style="display: inline;"> Foundational models such as the Segment Anything Model (SAM) are gaining traction in medical imaging segmentation, supporting multiple downstream tasks. However, such models are supervised in nature, still relying on large annotated datasets or prompts supplied by experts. Conventional techniques for alleviating such limitations, such as active learning, are limited in scope and still necessitate conti… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.04639v1-abstract-full').style.display = 'inline'; document.getElementById('2503.04639v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.04639v1-abstract-full" style="display: none;"> Foundational models such as the Segment Anything Model (SAM) are gaining traction in medical imaging segmentation, supporting multiple downstream tasks. However, such models are supervised in nature, still relying on large annotated datasets or prompts supplied by experts. Conventional techniques for alleviating such limitations, such as active learning, are limited in scope and still necessitate continuous human involvement and complex domain knowledge for label refinement or for establishing reward ground truth. To address these challenges, we propose an enhanced Segment Anything Model (SAM) framework that utilizes annotation-efficient prompts generated in a fully unsupervised fashion, while still capturing essential semantic, location, and shape information through contrastive language-image pretraining and visual question answering. We adopt the direct preference optimization technique to design an optimal policy that enables the model to generate high-fidelity segmentations with simple ratings or rankings provided by a virtual annotator simulating the human annotation process.
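<p>The direct preference optimization step can be sketched with the standard DPO objective: given log-probabilities of a preferred and a rejected output under the current policy and a frozen reference, minimize the negative log-sigmoid of the scaled margin. The snippet below is the generic textbook form with toy values; it is not claimed to be this paper's exact formulation.</p>
<pre><code>
import torch
import torch.nn.functional as F

def dpo_loss(logp_chosen, logp_rejected, ref_logp_chosen, ref_logp_rejected, beta=0.1):
    """Standard DPO: -log sigmoid(beta * (policy margin - reference margin))."""
    margin = (logp_chosen - ref_logp_chosen) - (logp_rejected - ref_logp_rejected)
    return -F.logsigmoid(beta * margin).mean()

# Toy values: the policy already prefers the annotator-ranked output slightly.
loss = dpo_loss(torch.tensor([-10.0]), torch.tensor([-12.0]),
                torch.tensor([-11.0]), torch.tensor([-11.5]))
print(loss)
</code></pre>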
State-of-the-art performance of our framework in tasks such as lung segmentation, breast tumor segmentation, and organ segmentation across various modalities, including X-ray, ultrasound, and abdominal CT, justifies its effectiveness in low-annotation data scenarios. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.04639v1-abstract-full').style.display = 'none'; document.getElementById('2503.04639v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to CVPR 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.04332">arXiv:2503.04332</a> <span> [<a href="https://arxiv.org/pdf/2503.04332">pdf</a>, <a href="https://arxiv.org/format/2503.04332">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> The Challenge of Identifying the Origin of Black-Box Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yang%2C+Z">Ziqing Yang</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+Y">Yixin Wu</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+Y">Yun Shen</a>, <a href="/search/cs?searchtype=author&query=Dai%2C+W">Wei Dai</a>, <a href="/search/cs?searchtype=author&query=Backes%2C+M">Michael Backes</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yang Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.04332v1-abstract-short" style="display: inline;"> The tremendous commercial potential of large language models (LLMs) has heightened concerns about their unauthorized use. Third parties can customize LLMs through fine-tuning and offer only black-box API access, effectively concealing unauthorized usage and complicating external auditing processes. This practice not only exacerbates unfair competition, but also violates licensing agreements. In re… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.04332v1-abstract-full').style.display = 'inline'; document.getElementById('2503.04332v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.04332v1-abstract-full" style="display: none;"> The tremendous commercial potential of large language models (LLMs) has heightened concerns about their unauthorized use. Third parties can customize LLMs through fine-tuning and offer only black-box API access, effectively concealing unauthorized usage and complicating external auditing processes. This practice not only exacerbates unfair competition, but also violates licensing agreements. In response, identifying the origin of black-box LLMs is an intrinsic solution to this issue. 
In this paper, we first reveal the limitations of state-of-the-art passive and proactive identification methods with experiments on 30 LLMs and two real-world black-box APIs. Then, we propose PlugAE, a proactive technique that optimizes adversarial token embeddings in a continuous space and proactively plugs them into the LLM for tracing and identification. Experiments show that PlugAE achieves substantial improvements in identifying fine-tuned derivatives. We further advocate for legal frameworks and regulations to better address the challenges posed by the unauthorized use of LLMs. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.04332v1-abstract-full').style.display = 'none'; document.getElementById('2503.04332v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.04240">arXiv:2503.04240</a> <span> [<a href="https://arxiv.org/pdf/2503.04240">pdf</a>, <a href="https://arxiv.org/format/2503.04240">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> DiffPO: Diffusion-styled Preference Optimization for Efficient Inference-Time Alignment of Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+R">Ruizhe Chen</a>, <a href="/search/cs?searchtype=author&query=Chai%2C+W">Wenhao Chai</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Z">Zhifei Yang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xiaotian Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+J+T">Joey Tianyi Zhou</a>, <a href="/search/cs?searchtype=author&query=Quek%2C+T">Tony Quek</a>, <a href="/search/cs?searchtype=author&query=Poria%2C+S">Soujanya Poria</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Z">Zuozhu Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.04240v2-abstract-short" style="display: inline;"> Inference-time alignment provides an efficient alternative for aligning LLMs with humans. However, these approaches still face challenges, such as limited scalability due to policy-specific value functions and latency during the inference phase. In this paper, we propose a novel approach, Diffusion-styled Preference Optimization (DiffPO), which provides an efficient and policy-agnostic solution fo… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.04240v2-abstract-full').style.display = 'inline'; document.getElementById('2503.04240v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.04240v2-abstract-full" style="display: none;"> Inference-time alignment provides an efficient alternative for aligning LLMs with humans. However, these approaches still face challenges, such as limited scalability due to policy-specific value functions and latency during the inference phase.
In this paper, we propose a novel approach, Diffusion-styled Preference Optimization (DiffPO), which provides an efficient and policy-agnostic solution for aligning LLMs with humans. By directly performing alignment at the sentence level, DiffPO avoids the time latency associated with token-level generation. Designed as a plug-and-play module, DiffPO can be seamlessly integrated with various base models to enhance their alignment. Extensive experiments on AlpacaEval 2, MT-bench, and HH-RLHF demonstrate that DiffPO achieves superior alignment performance across various settings, achieving a favorable trade-off between alignment quality and inference-time latency. Furthermore, DiffPO demonstrates model-agnostic scalability, significantly improving the performance of large models such as Llama-3-70B. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.04240v2-abstract-full').style.display = 'none'; document.getElementById('2503.04240v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 6 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.04222">arXiv:2503.04222</a> <span> [<a href="https://arxiv.org/pdf/2503.04222">pdf</a>, <a href="https://arxiv.org/format/2503.04222">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> FuseChat-3.0: Preference Optimization Meets Heterogeneous Model Fusion </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yang%2C+Z">Ziyi Yang</a>, <a href="/search/cs?searchtype=author&query=Wan%2C+F">Fanqi Wan</a>, <a href="/search/cs?searchtype=author&query=Zhong%2C+L">Longguang Zhong</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+C">Canbin Huang</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+G">Guosheng Liang</a>, <a href="/search/cs?searchtype=author&query=Quan%2C+X">Xiaojun Quan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.04222v1-abstract-short" style="display: inline;"> We introduce FuseChat-3.0, a suite of large language models (LLMs) developed by integrating the strengths of heterogeneous source LLMs into more compact target LLMs. Our source models include the powerful Gemma-2-27B-it, Mistral-Large-Instruct-2407, Qwen-2.5-72B-Instruct, and Llama-3.1-70B-Instruct. For target models, we focus on three widely used smaller variants (Llama-3.1-8B… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.04222v1-abstract-full').style.display = 'inline'; document.getElementById('2503.04222v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.04222v1-abstract-full" style="display: none;"> We introduce FuseChat-3.0, a suite of large language models (LLMs) developed by integrating the strengths of heterogeneous source LLMs into more compact target LLMs.
Our source models include the powerful Gemma-2-27B-it, Mistral-Large-Instruct-2407, Qwen-2.5-72B-Instruct, and Llama-3.1-70B-Instruct. For target models, we focus on three widely used smaller variants (Llama-3.1-8B-Instruct, Gemma-2-9B-it, and Qwen-2.5-7B-Instruct), along with two ultra-compact options, Llama-3.2-3B-Instruct and Llama-3.2-1B-Instruct. To leverage the diverse capabilities of these source models, we develop a specialized data construction protocol tailored to various tasks and domains. The FuseChat-3.0 training pipeline consists of two key stages: (1) supervised fine-tuning (SFT) to align the target and source model distributions, and (2) Direct Preference Optimization (DPO) to apply preferences from multiple source LLMs to fine-tune the target model. The resulting FuseChat-3.0 models exhibit significant performance gains across tasks such as instruction following, general knowledge, mathematics, and coding. As illustrated in Figure 1, using Llama-3.1-8B-Instruct as the target model, our fusion approach achieves an average improvement of 6.8 points across 14 benchmarks. Moreover, it demonstrates remarkable gains of 37.1 points and 30.1 points on the instruction-following benchmarks AlpacaEval-2 and Arena-Hard, respectively. Our code, models, and datasets are available at https://github.com/SLIT-AI/FuseChat-3.0. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.04222v1-abstract-full').style.display = 'none'; document.getElementById('2503.04222v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025.
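<p>Stage (2) above implies constructing preference pairs from the heterogeneous source models' outputs. A minimal sketch of that data-construction idea: generate candidates, rank them with some scorer, and keep the best and worst as a DPO pair. The <code>score</code> function and the dictionary of model names below are illustrative stand-ins, not FuseChat-3.0's actual protocol.</p>
<pre><code>
def build_dpo_pair(prompt, source_responses, score):
    """Rank responses from heterogeneous source models by a scorer and
    return a (chosen, rejected) preference pair for DPO training."""
    ranked = sorted(source_responses, key=score, reverse=True)
    return {"prompt": prompt, "chosen": ranked[0], "rejected": ranked[-1]}

# Hypothetical candidate responses from three source models.
responses = {
    "Gemma-2-27B-it": "answer A ...",
    "Qwen-2.5-72B-Instruct": "answer B ...",
    "Llama-3.1-70B-Instruct": "answer C ...",
}
# A real pipeline would use a reward model; length is a stand-in scorer here.
pair = build_dpo_pair("Explain DPO briefly.", list(responses.values()), score=len)
print(pair["chosen"], "|", pair["rejected"])
</code></pre>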
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Technical report</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.04184">arXiv:2503.04184</a> <span> [<a href="https://arxiv.org/pdf/2503.04184">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Networking and Internet Architecture">cs.NI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Large-Scale AI in Telecom: Charting the Roadmap for Innovation, Scalability, and Enhanced Digital Experiences </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Shahid%2C+A">Adnan Shahid</a>, <a href="/search/cs?searchtype=author&query=Kliks%2C+A">Adrian Kliks</a>, <a href="/search/cs?searchtype=author&query=Al-Tahmeesschi%2C+A">Ahmed Al-Tahmeesschi</a>, <a href="/search/cs?searchtype=author&query=Elbakary%2C+A">Ahmed Elbakary</a>, <a href="/search/cs?searchtype=author&query=Nikou%2C+A">Alexandros Nikou</a>, <a href="/search/cs?searchtype=author&query=Maatouk%2C+A">Ali Maatouk</a>, <a href="/search/cs?searchtype=author&query=Mokh%2C+A">Ali Mokh</a>, <a href="/search/cs?searchtype=author&query=Kazemi%2C+A">Amirreza Kazemi</a>, <a href="/search/cs?searchtype=author&query=De+Domenico%2C+A">Antonio De Domenico</a>, <a href="/search/cs?searchtype=author&query=Karapantelakis%2C+A">Athanasios Karapantelakis</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+B">Bo Cheng</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+B">Bo Yang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+B">Bohao Wang</a>, <a href="/search/cs?searchtype=author&query=Fischione%2C+C">Carlo Fischione</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+C">Chao Zhang</a>, <a href="/search/cs?searchtype=author&query=Issaid%2C+C+B">Chaouki Ben Issaid</a>, <a href="/search/cs?searchtype=author&query=Yuen%2C+C">Chau Yuen</a>, <a href="/search/cs?searchtype=author&query=Peng%2C+C">Chenghui Peng</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+C">Chongwen Huang</a>, <a href="/search/cs?searchtype=author&query=Chaccour%2C+C">Christina Chaccour</a>, <a href="/search/cs?searchtype=author&query=Thomas%2C+C+K">Christo Kurisummoottil Thomas</a>, <a href="/search/cs?searchtype=author&query=Sharma%2C+D">Dheeraj Sharma</a>, <a href="/search/cs?searchtype=author&query=Kalogiros%2C+D">Dimitris Kalogiros</a>, <a href="/search/cs?searchtype=author&query=Niyato%2C+D">Dusit Niyato</a>, <a href="/search/cs?searchtype=author&query=De+Poorter%2C+E">Eli De Poorter</a> , et al. (110 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.04184v1-abstract-short" style="display: inline;"> This white paper discusses the role of large-scale AI in the telecommunications industry, with a specific focus on the potential of generative AI to revolutionize network functions and user experiences, especially in the context of 6G systems. 
It highlights the development and deployment of Large Telecom Models (LTMs), which are tailored AI models designed to address the complex challenges faced b… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.04184v1-abstract-full').style.display = 'inline'; document.getElementById('2503.04184v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.04184v1-abstract-full" style="display: none;"> This white paper discusses the role of large-scale AI in the telecommunications industry, with a specific focus on the potential of generative AI to revolutionize network functions and user experiences, especially in the context of 6G systems. It highlights the development and deployment of Large Telecom Models (LTMs), which are tailored AI models designed to address the complex challenges faced by modern telecom networks. The paper covers a wide range of topics, from the architecture and deployment strategies of LTMs to their applications in network management, resource allocation, and optimization. It also explores the regulatory, ethical, and standardization considerations for LTMs, offering insights into their future integration into telecom infrastructure. The goal is to provide a comprehensive roadmap for the adoption of LTMs to enhance scalability, performance, and user-centric innovation in telecom networks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.04184v1-abstract-full').style.display = 'none'; document.getElementById('2503.04184v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.04155">arXiv:2503.04155</a> <span> [<a href="https://arxiv.org/pdf/2503.04155">pdf</a>, <a href="https://arxiv.org/format/2503.04155">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> BPQA Dataset: Evaluating How Well Language Models Leverage Blood Pressures to Answer Biomedical Questions </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Hang%2C+C">Chi Hang</a>, <a href="/search/cs?searchtype=author&query=Deng%2C+R">Ruiqi Deng</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+L+Y">Lavender Yao Jiang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Z">Zihao Yang</a>, <a href="/search/cs?searchtype=author&query=Alyakin%2C+A">Anton Alyakin</a>, <a href="/search/cs?searchtype=author&query=Alber%2C+D">Daniel Alber</a>, <a href="/search/cs?searchtype=author&query=Oermann%2C+E+K">Eric Karl Oermann</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.04155v1-abstract-short" style="display: inline;"> Clinical measurements such as blood pressures and respiration rates are critical in diagnosing and monitoring patient outcomes. 
Such measurements are an important component of biomedical data and can be used to train transformer-based language models (LMs) to improve healthcare delivery. It is, however, unclear whether LMs can effectively interpret and use clinical measurements. We investigate two questions… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.04155v1-abstract-full').style.display = 'inline'; document.getElementById('2503.04155v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.04155v1-abstract-full" style="display: none;"> Clinical measurements such as blood pressures and respiration rates are critical in diagnosing and monitoring patient outcomes. Such measurements are an important component of biomedical data and can be used to train transformer-based language models (LMs) to improve healthcare delivery. It is, however, unclear whether LMs can effectively interpret and use clinical measurements. We investigate two questions: First, can LMs effectively leverage clinical measurements to answer related medical questions? Second, how to enhance an LM's performance on medical question-answering (QA) tasks that involve measurements? We performed a case study on blood pressure readings (BPs), a vital sign routinely monitored by medical professionals. We evaluated the performance of four LMs: BERT, BioBERT, MedAlpaca, and GPT-3.5, on our newly developed dataset, BPQA (Blood Pressure Question Answering). BPQA contains 100 medical QA pairs that were verified by medical students and designed to rely on BPs. We found that GPT-3.5 and MedAlpaca (large and medium-sized LMs) benefit more from the inclusion of BPs than BERT and BioBERT (small-sized LMs). Further, augmenting measurements with labels improves the performance of BioBERT and MedAlpaca (domain-specific LMs), suggesting that retrieval may be useful for improving domain-specific LMs. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.04155v1-abstract-full').style.display = 'none'; document.getElementById('2503.04155v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025.
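<p>The finding that augmenting measurements with labels helps suggests rewriting a raw reading such as 142/91 as "142/91 (stage 2 hypertension)" before feeding it to the LM. The sketch below uses the common AHA cut-offs as an assumption; the paper's exact labeling scheme is not given in the abstract.</p>
<pre><code>
def label_bp(systolic, diastolic):
    """Map a blood pressure reading to a coarse category (AHA-style cut-offs,
    assumed here; not necessarily the BPQA labeling scheme)."""
    if systolic >= 140 or diastolic >= 90:
        cat = "stage 2 hypertension"
    elif systolic >= 130 or diastolic >= 80:
        cat = "stage 1 hypertension"
    elif systolic >= 120:
        cat = "elevated"
    else:
        cat = "normal"
    return f"{systolic}/{diastolic} ({cat})"

print(label_bp(142, 91))   # 142/91 (stage 2 hypertension)
print(label_bp(118, 76))   # 118/76 (normal)
</code></pre>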
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">9 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.03882">arXiv:2503.03882</a> <span> [<a href="https://arxiv.org/pdf/2503.03882">pdf</a>, <a href="https://arxiv.org/format/2503.03882">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> IC-Mapper: Instance-Centric Spatio-Temporal Modeling for Online Vectorized Map Construction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhu%2C+J">Jiangtong Zhu</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Z">Zhao Yang</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+Y">Yinan Shi</a>, <a href="/search/cs?searchtype=author&query=Fang%2C+J">Jianwu Fang</a>, <a href="/search/cs?searchtype=author&query=Xue%2C+J">Jianru Xue</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.03882v1-abstract-short" style="display: inline;"> Online vector map construction based on visual data can bypass the processes of data collection, post-processing, and manual annotation required by traditional map construction, which significantly enhances map-building efficiency. However, existing work treats the online mapping task as a local range perception task, overlooking the spatial scalability required for map construction. We propose IC… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.03882v1-abstract-full').style.display = 'inline'; document.getElementById('2503.03882v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.03882v1-abstract-full" style="display: none;"> Online vector map construction based on visual data can bypass the processes of data collection, post-processing, and manual annotation required by traditional map construction, which significantly enhances map-building efficiency. However, existing work treats the online mapping task as a local range perception task, overlooking the spatial scalability required for map construction. We propose IC-Mapper, an instance-centric online mapping framework, which comprises two primary components: 1) Instance-centric temporal association module: For the detection queries of adjacent frames, we measure them in both feature and geometric dimensions to obtain the matching correspondence between instances across frames. 2) Instance-centric spatial fusion module: We perform point sampling on the historical global map from a spatial dimension and integrate it with the detection results of instances corresponding to the current frame to achieve real-time expansion and update of the map. Based on the nuScenes dataset, we evaluate our approach on detection, tracking, and global mapping metrics. Experimental results demonstrate the superiority of IC-Mapper against other state-of-the-art methods. Code will be released on https://github.com/Brickzhuantou/IC-Mapper. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.03882v1-abstract-full').style.display = 'none'; document.getElementById('2503.03882v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.03689">arXiv:2503.03689</a> <span> [<a href="https://arxiv.org/pdf/2503.03689">pdf</a>, <a href="https://arxiv.org/format/2503.03689">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> DualDiff+: Dual-Branch Diffusion for High-Fidelity Video Generation with Reward Guidance </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yang%2C+Z">Zhao Yang</a>, <a href="/search/cs?searchtype=author&query=Qian%2C+Z">Zezhong Qian</a>, <a href="/search/cs?searchtype=author&query=Li%2C+X">Xiaofan Li</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+W">Weixiang Xu</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+G">Gongpeng Zhao</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+R">Ruohong Yu</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+L">Lingsi Zhu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+L">Longjun Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.03689v1-abstract-short" style="display: inline;"> Accurate and high-fidelity driving scene reconstruction demands the effective utilization of comprehensive scene information as conditional inputs. Existing methods predominantly rely on 3D bounding boxes and BEV road maps for foreground and background control, which fail to capture the full complexity of driving scenes and adequately integrate multimodal information. In this work, we present Dual… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.03689v1-abstract-full').style.display = 'inline'; document.getElementById('2503.03689v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.03689v1-abstract-full" style="display: none;"> Accurate and high-fidelity driving scene reconstruction demands the effective utilization of comprehensive scene information as conditional inputs. Existing methods predominantly rely on 3D bounding boxes and BEV road maps for foreground and background control, which fail to capture the full complexity of driving scenes and adequately integrate multimodal information. In this work, we present DualDiff, a dual-branch conditional diffusion model designed to enhance driving scene generation across multiple views and video sequences. Specifically, we introduce Occupancy Ray-shape Sampling (ORS) as a conditional input, offering rich foreground and background semantics alongside 3D spatial geometry to precisely control the generation of both elements. 
arXiv:2503.03689 (https://arxiv.org/abs/2503.03689) [cs.CV]
DualDiff+: Dual-Branch Diffusion for High-Fidelity Video Generation with Reward Guidance
Authors: Zhao Yang, Zezhong Qian, Xiaofan Li, Weixiang Xu, Gongpeng Zhao, Ruohong Yu, Lingsi Zhu, Longjun Liu
Abstract: Accurate and high-fidelity driving scene reconstruction demands the effective utilization of comprehensive scene information as conditional inputs. Existing methods predominantly rely on 3D bounding boxes and BEV road maps for foreground and background control, which fail to capture the full complexity of driving scenes and to adequately integrate multimodal information. In this work, we present DualDiff, a dual-branch conditional diffusion model designed to enhance driving scene generation across multiple views and video sequences. Specifically, we introduce Occupancy Ray-shape Sampling (ORS) as a conditional input, offering rich foreground and background semantics alongside 3D spatial geometry to precisely control the generation of both elements. To improve the synthesis of fine-grained foreground objects, particularly complex and distant ones, we propose a Foreground-Aware Mask (FGM) denoising loss function. Additionally, we develop the Semantic Fusion Attention (SFA) mechanism to dynamically prioritize relevant information and suppress noise, enabling more effective multimodal fusion. Finally, to ensure high-quality image-to-video generation, we introduce the Reward-Guided Diffusion (RGD) framework, which maintains global consistency and semantic coherence in generated videos. Extensive experiments demonstrate that DualDiff achieves state-of-the-art (SOTA) performance across multiple datasets. On the nuScenes dataset, DualDiff reduces the FID score by 4.09% compared to the best baseline. In downstream tasks such as BEV segmentation, our method improves vehicle mIoU by 4.50% and road mIoU by 1.70%, while in BEV 3D object detection, the foreground mAP increases by 1.46%. Code will be made available at https://github.com/yangzhaojason/DualDiff.
Submitted 5 March, 2025; originally announced March 2025.
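The FGM loss is described only at a high level; a plausible reading is a foreground-weighted denoising objective. The sketch below upweights the standard epsilon-prediction MSE on foreground pixels; the weighting scheme is an assumption, not the paper's exact formulation.

```python
import torch

def fgm_loss(eps_pred, eps_true, fg_mask, fg_weight=2.0):
    """eps_pred, eps_true: (B, C, H, W) noise tensors; fg_mask: (B, 1, H, W) in {0, 1}.
    Foreground pixels contribute fg_weight times more than background (assumed scheme)."""
    per_pixel = (eps_pred - eps_true) ** 2
    weights = 1.0 + (fg_weight - 1.0) * fg_mask   # 1 on background, fg_weight on foreground
    return (weights * per_pixel).mean()
```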
arXiv:2503.02913 (https://arxiv.org/abs/2503.02913) [cs.MA, cs.AI, cs.RO]
Towards Robust Multi-UAV Collaboration: MARL with Noise-Resilient Communication and Attention Mechanisms
Authors: Zilin Zhao, Chishui Chen, Haotian Shi, Jiale Chen, Xuanlin Yue, Zhejian Yang, Yang Liu
Abstract: Efficient path planning for unmanned aerial vehicles (UAVs) is crucial in remote sensing and information collection. As task scales expand, the cooperative deployment of multiple UAVs significantly improves information collection efficiency. However, collaborative communication and decision-making for multiple UAVs remain major challenges in path planning, especially in noisy environments. To efficiently accomplish complex information collection tasks in 3D space and address robust communication issues, we propose a multi-agent reinforcement learning (MARL) framework for UAV path planning based on the Counterfactual Multi-Agent Policy Gradients (COMA) algorithm. The framework incorporates an attention-based UAV communication protocol and a training-deployment system, significantly improving communication robustness and individual decision-making capabilities in noisy conditions. Experiments conducted on both synthetic and real-world datasets demonstrate that our method outperforms existing algorithms in terms of path planning efficiency and robustness, especially in noisy environments, achieving a 78% improvement in entropy reduction.
Submitted 4 March, 2025; originally announced March 2025.
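The COMA algorithm that this framework builds on uses a counterfactual baseline: each agent's advantage compares the joint-action value against the expected value when only that agent's action is marginalized under its own policy, with the other agents' actions held fixed. A minimal rendering of that advantage:

```python
# COMA-style counterfactual advantage (Foerster et al., 2018):
# A_a(s, u) = Q(s, u) - sum_{u'_a} pi(u'_a | tau_a) * Q(s, (u_{-a}, u'_a))
import numpy as np

def counterfactual_advantage(q_values, policy_probs, taken_action):
    """q_values[k]: critic value Q(s, (u_-a, k)) with other agents' actions fixed;
    policy_probs[k]: agent a's policy pi(k | tau_a); taken_action: chosen index."""
    baseline = float(np.dot(policy_probs, q_values))
    return float(q_values[taken_action]) - baseline

# Example: three candidate actions for one UAV.
q = np.array([1.0, 0.2, -0.5])
pi = np.array([0.6, 0.3, 0.1])
print(counterfactual_advantage(q, pi, taken_action=0))  # 1.0 - 0.61 = 0.39
```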
arXiv:2503.02624 (https://arxiv.org/abs/2503.02624) [cs.RO]
Human-aligned Safe Reinforcement Learning for Highway On-Ramp Merging in Dense Traffic
Authors: Yang Li, Shijie Yuan, Yuan Chang, Xiaolong Chen, Qisong Yang, Zhiyuan Yang, Hongmao Qin
Abstract: Most reinforcement learning (RL) approaches for the decision-making of autonomous driving consider safety as a reward instead of a cost, which makes it hard to balance the tradeoff between safety and other objectives. Human risk preference has also rarely been incorporated, so the trained policy may be either too conservative or too aggressive for users. To this end, this study proposes a human-aligned safe RL approach for autonomous merging, in which the high-level decision problem is formulated as a constrained Markov decision process (CMDP) that incorporates users' risk preferences into the safety constraints, followed by model predictive control (MPC)-based low-level control. The safety level of the RL policy can be adjusted by computing the cost limits of the CMDP's constraints from risk preferences and traffic density using a fuzzy control method. To filter out unsafe or invalid actions, we design an action-shielding mechanism that pre-executes RL actions using an MPC method and performs collision checks with surrounding agents. We also provide a theoretical proof of the shielding mechanism's effectiveness in enhancing RL's safety and sample efficiency. Simulation experiments at multiple traffic densities show that our method can significantly reduce safety violations without sacrificing traffic efficiency. Furthermore, owing to the risk preference-aware constraints in the CMDP and action shielding, we can not only adjust the safety level of the final policy but also reduce safety violations during training, providing a promising solution for online learning in real-world environments.
Submitted 4 March, 2025; originally announced March 2025.
Comments: 20 pages, 16 figures
arXiv:2503.02537 (https://arxiv.org/abs/2503.02537) [cs.CV, cs.AI]
RectifiedHR: Enable Efficient High-Resolution Image Generation via Energy Rectification
Authors: Zhen Yang, Guibao Shen, Liang Hou, Mushui Liu, Luozhou Wang, Xin Tao, Pengfei Wan, Di Zhang, Ying-Cong Chen
Abstract: Diffusion models have achieved remarkable advances in various image generation tasks. However, their performance notably declines when generating images at resolutions higher than those used during training. Despite the existence of numerous methods for producing high-resolution images, they either suffer from inefficiency or are hindered by complex operations. In this paper, we propose RectifiedHR, a straightforward and efficient solution for training-free high-resolution image generation. Specifically, we introduce a noise refresh strategy, which theoretically requires only a few lines of code to unlock the model's high-resolution generation ability and improve efficiency. Additionally, we are the first to observe the phenomenon of energy decay, which may cause image blurriness during high-resolution image generation. To address this issue, we introduce average latent energy analysis and find that an improved classifier-free guidance hyperparameter can significantly enhance generation performance. Our method is entirely training-free, with simple implementation logic and efficient performance. Extensive comparisons with numerous baseline methods demonstrate RectifiedHR's superior effectiveness and efficiency.
Submitted 14 March, 2025; v1 submitted 4 March, 2025; originally announced March 2025.
Comments: Project Page: https://zhenyangcs.github.io/RectifiedHR-Diffusion/
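The abstract notes that noise refresh "theoretically requires only a few lines of code". One plausible reading, sketched below, is re-noising the current latent to an earlier diffusion time via the standard DDPM forward process x_t = sqrt(alpha_bar_t) * x_0 + sqrt(1 - alpha_bar_t) * eps and then resuming denoising; where and how often to refresh is the paper's contribution and is not reproduced here.

```python
import math
import torch

def noise_refresh(latent, alpha_bar_t):
    """Re-noise the current latent to diffusion time t (signal level alpha_bar_t)."""
    eps = torch.randn_like(latent)
    return math.sqrt(alpha_bar_t) * latent + math.sqrt(1.0 - alpha_bar_t) * eps

# E.g., partway through high-resolution sampling (refresh point assumed):
# latent = noise_refresh(latent, alpha_bar_t=0.7)  # then resume denoising from t
```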
arXiv:2503.02206 (https://arxiv.org/abs/2503.02206) [cs.CV]
Language-Guided Visual Perception Disentanglement for Image Quality Assessment and Conditional Image Generation
Authors: Zhichao Yang, Leida Li, Pengfei Chen, Jinjian Wu, Giuseppe Valenzise
Abstract: Contrastive vision-language models, such as CLIP, have demonstrated excellent zero-shot capability across semantic recognition tasks, mainly attributed to training on a large-scale I&1T (one Image with one Text) dataset. Such multimodal representations often blend semantic and perceptual elements, with a particular emphasis on semantics. However, this can be problematic for popular tasks like image quality assessment (IQA) and conditional image generation (CIG), which typically require fine-grained control over perceptual and semantic features. Motivated by these observations, this paper presents a new multimodal disentangled representation learning framework that leverages disentangled text to guide image disentanglement. To this end, we first build an I&2T (one Image with a perceptual Text and a semantic Text) dataset, which consists of disentangled perceptual and semantic text descriptions for each image. Then, the disentangled text descriptions are used as supervisory signals to disentangle pure perceptual representations from CLIP's original 'coarse' feature space, dubbed DeCLIP. Finally, the decoupled feature representations are used for both image quality assessment (technical quality and aesthetic quality) and conditional image generation. Extensive experiments and comparisons demonstrate the advantages of the proposed method on the two tasks. The dataset, code, and model will be made available.
Submitted 3 March, 2025; originally announced March 2025.
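A sketch of how I&2T supervision could be wired up: project the same image feature through two heads, each aligned with its matching text branch by a symmetric InfoNCE loss. The projection heads and loss pairing are assumptions, not DeCLIP's actual architecture.

```python
import torch
import torch.nn.functional as F

def clip_loss(img_emb, txt_emb, temperature=0.07):
    """Symmetric InfoNCE over a batch of matched image/text embeddings."""
    img = F.normalize(img_emb, dim=-1)
    txt = F.normalize(txt_emb, dim=-1)
    logits = img @ txt.T / temperature
    labels = torch.arange(img.shape[0])
    return 0.5 * (F.cross_entropy(logits, labels) + F.cross_entropy(logits.T, labels))

def disentangled_loss(img_feat, perc_txt_emb, sem_txt_emb, perc_head, sem_head):
    # Two projections of the same image feature, each tied to its own text branch
    # (perceptual caption vs. semantic caption).
    return (clip_loss(perc_head(img_feat), perc_txt_emb)
            + clip_loss(sem_head(img_feat), sem_txt_emb))
```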
arXiv:2503.01903 (https://arxiv.org/abs/2503.01903) [cs.CL, cs.AI, cs.HC]
PsychBench: A comprehensive and professional benchmark for evaluating the performance of LLM-assisted psychiatric clinical practice
Authors: Ruoxi Wang, Shuyu Liu, Ling Zhang, Xuequan Zhu, Rui Yang, Xinzhu Zhou, Fei Wu, Zhi Yang, Cheng Jin, Gang Wang
Abstract: The advent of Large Language Models (LLMs) offers potential solutions to problems such as the shortage of medical resources and low diagnostic consistency in psychiatric clinical practice. Despite this potential, a robust and comprehensive benchmarking framework for assessing the efficacy of LLMs in authentic psychiatric clinical environments has been absent, impeding the advancement of specialized LLMs tailored to psychiatric applications. In response to this gap, and incorporating clinical demands and clinical data from psychiatry, we propose a benchmarking system, PsychBench, to evaluate the practical performance of LLMs in psychiatric clinical settings. We conducted a comprehensive quantitative evaluation of 16 LLMs using PsychBench and investigated the impact of prompt design, chain-of-thought reasoning, input text length, and domain-specific knowledge fine-tuning on model performance. Through detailed error analysis, we identified strengths and potential limitations of the existing models and suggested directions for improvement. Subsequently, a clinical reader study involving 60 psychiatrists of varying seniority was conducted to further explore the practical benefits of existing LLMs as supportive tools. Through the quantitative and reader evaluations, we show that while existing models demonstrate significant potential, they are not yet adequate as decision-making tools in psychiatric clinical practice. The reader study further indicates that, as an auxiliary tool, an LLM can provide particularly notable support for junior psychiatrists, effectively enhancing their work efficiency and overall clinical quality. To promote research in this area, we will make the dataset and evaluation framework publicly available, with the hope of advancing the application of LLMs in psychiatric clinical settings.
Submitted 28 February, 2025; originally announced March 2025.
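For readers unfamiliar with benchmark harnesses of this kind, a skeletal evaluation loop might look like the following; the dataset schema, model interface, and metric are placeholders, not PsychBench's actual API.

```python
# Placeholder benchmark loop: models map prompts to text; score_fn compares
# an answer to a reference. Everything here is hypothetical scaffolding.
def evaluate(models, cases, score_fn, use_cot=False):
    results = {}
    for name, generate in models.items():
        scores = []
        for case in cases:                      # case: {"prompt": ..., "reference": ...}
            prompt = case["prompt"]
            if use_cot:                         # chain-of-thought condition
                prompt += "\nReason step by step before giving a final answer."
            scores.append(score_fn(generate(prompt), case["reference"]))
        results[name] = sum(scores) / len(scores)
    return results

# Example with a trivial model and exact-match scoring:
models = {"echo": lambda p: "insomnia"}
cases = [{"prompt": "Chief complaint: cannot sleep. Diagnosis?", "reference": "insomnia"}]
print(evaluate(models, cases, lambda a, r: float(a == r)))
```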
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.01874v1-abstract-full').style.display = 'none'; document.getElementById('2503.01874v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.01743">arXiv:2503.01743</a> <span> [<a href="https://arxiv.org/pdf/2503.01743">pdf</a>, <a href="https://arxiv.org/format/2503.01743">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Phi-4-Mini Technical Report: Compact yet Powerful Multimodal Language Models via Mixture-of-LoRAs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Microsoft"> Microsoft</a>, <a href="/search/cs?searchtype=author&query=%3A"> :</a>, <a href="/search/cs?searchtype=author&query=Abouelenin%2C+A">Abdelrahman Abouelenin</a>, <a href="/search/cs?searchtype=author&query=Ashfaq%2C+A">Atabak Ashfaq</a>, <a href="/search/cs?searchtype=author&query=Atkinson%2C+A">Adam Atkinson</a>, <a href="/search/cs?searchtype=author&query=Awadalla%2C+H">Hany Awadalla</a>, <a href="/search/cs?searchtype=author&query=Bach%2C+N">Nguyen Bach</a>, <a href="/search/cs?searchtype=author&query=Bao%2C+J">Jianmin Bao</a>, <a href="/search/cs?searchtype=author&query=Benhaim%2C+A">Alon Benhaim</a>, <a href="/search/cs?searchtype=author&query=Cai%2C+M">Martin Cai</a>, <a href="/search/cs?searchtype=author&query=Chaudhary%2C+V">Vishrav Chaudhary</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+C">Congcong Chen</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+D">Dong Chen</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+D">Dongdong Chen</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+J">Junkun Chen</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+W">Weizhu Chen</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yen-Chun Chen</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yi-ling Chen</a>, <a href="/search/cs?searchtype=author&query=Dai%2C+Q">Qi Dai</a>, <a href="/search/cs?searchtype=author&query=Dai%2C+X">Xiyang Dai</a>, <a href="/search/cs?searchtype=author&query=Fan%2C+R">Ruchao Fan</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+M">Mei Gao</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+M">Min Gao</a>, <a href="/search/cs?searchtype=author&query=Garg%2C+A">Amit Garg</a>, <a href="/search/cs?searchtype=author&query=Goswami%2C+A">Abhishek Goswami</a> , et al. (51 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.01743v2-abstract-short" style="display: inline;"> We introduce Phi-4-Mini and Phi-4-Multimodal, compact yet highly capable language and multimodal models. 
arXiv:2503.01743 (https://arxiv.org/abs/2503.01743) [cs.CL, cs.AI, cs.LG]
Phi-4-Mini Technical Report: Compact yet Powerful Multimodal Language Models via Mixture-of-LoRAs
Authors: Microsoft: Abdelrahman Abouelenin, Atabak Ashfaq, Adam Atkinson, Hany Awadalla, Nguyen Bach, Jianmin Bao, Alon Benhaim, Martin Cai, Vishrav Chaudhary, Congcong Chen, Dong Chen, Dongdong Chen, Junkun Chen, Weizhu Chen, Yen-Chun Chen, Yi-ling Chen, Qi Dai, Xiyang Dai, Ruchao Fan, Mei Gao, Min Gao, Amit Garg, Abhishek Goswami, et al. (51 additional authors not shown)
Abstract: We introduce Phi-4-Mini and Phi-4-Multimodal, compact yet highly capable language and multimodal models. Phi-4-Mini is a 3.8-billion-parameter language model trained on high-quality web and synthetic data; it significantly outperforms recent open-source models of similar size and matches the performance of models twice its size on math and coding tasks requiring complex reasoning. This achievement is driven by a carefully curated synthetic data recipe emphasizing high-quality math and coding datasets. Compared to its predecessor, Phi-3.5-Mini, Phi-4-Mini features an expanded vocabulary of 200K tokens to better support multilingual applications, as well as group query attention for more efficient long-sequence generation. Phi-4-Multimodal is a multimodal model that integrates text, vision, and speech/audio input modalities into a single model. Its novel modality-extension approach leverages LoRA adapters and modality-specific routers to allow multiple inference modes combining various modalities without interference. For example, it ranks first on the OpenASR leaderboard to date, even though the LoRA component of the speech/audio modality has just 460 million parameters. Phi-4-Multimodal supports scenarios involving (vision + language), (vision + speech), and (speech/audio) inputs, outperforming larger vision-language and speech-language models on a wide range of tasks. Additionally, we experiment with further training Phi-4-Mini to enhance its reasoning capabilities. Despite its compact 3.8-billion-parameter size, this experimental version achieves reasoning performance on par with or surpassing significantly larger models, including DeepSeek-R1-Distill-Qwen-7B and DeepSeek-R1-Distill-Llama-8B.
Submitted 7 March, 2025; v1 submitted 3 March, 2025; originally announced March 2025.
Comments: 39 pages
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.01146v1-abstract-full').style.display = 'none'; document.getElementById('2503.01146v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">8 pages, 9 figures</span> </p> </li> </ol> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Yang%2C+Z&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Yang%2C+Z&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Yang%2C+Z&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Yang%2C+Z&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Yang%2C+Z&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Yang%2C+Z&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">…</span></li> </ul> </nav> <div class="is-hidden-tablet"> <!-- feedback for mobile only --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a> </span> </div> </div> </main> <footer> <div class="columns is-desktop" role="navigation" aria-label="Secondary"> <!-- MetaColumn 1 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> 