Search | arXiv e-print repository
<a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1–50 of 5,803 results for author: <span class="mathjax">Zhang, X</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a> </span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&query=Zhang%2C+X">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." 
type="text" value="Zhang, X"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Zhang%2C+X&terms-0-field=author&size=50&order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Zhang, X"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. 
</div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Zhang%2C+X&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Zhang%2C+X&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Zhang%2C+X&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Zhang%2C+X&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Zhang%2C+X&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Zhang%2C+X&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">…</span></li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.07509">arXiv:2503.07509</a> <span> [<a href="https://arxiv.org/pdf/2503.07509">pdf</a>, <a href="https://arxiv.org/format/2503.07509">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Interference-Aware Super-Constellation Design for NOMA </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Vaezi%2C+M">Mojtaba Vaezi</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xinliang Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.07509v1-abstract-short" style="display: inline;"> Non-orthogonal multiple access (NOMA) has gained significant attention as a potential next-generation multiple access technique. However, its implementation with finite-alphabet inputs faces challenges. Particularly, due to inter-user interference, superimposed constellations may have overlapping symbols leading to high bit error rates when successive interference cancellation (SIC) is applied. 
   Abstract: Non-orthogonal multiple access (NOMA) has gained significant attention as a potential next-generation multiple access technique. However, its implementation with finite-alphabet inputs faces challenges. In particular, due to inter-user interference, superimposed constellations may have overlapping symbols, leading to high bit error rates when successive interference cancellation (SIC) is applied. To tackle the issue, this paper employs autoencoders to design interference-aware super-constellations. Unlike conventional methods, where the superimposed constellation may have overlapping symbols, the proposed autoencoder-based NOMA (AE-NOMA) is trained to design super-constellations with distinguishable symbols at receivers, regardless of channel gains. The proposed architecture removes the need for SIC, allowing maximum likelihood-based approaches to be used instead. The paper presents the conceptual architecture, loss functions, and training strategies for AE-NOMA. Various test results demonstrate the effectiveness of interference-aware constellations in improving the bit error rate, indicating the adaptability of AE-NOMA to different channel scenarios and its promising potential for implementing NOMA systems.
   Submitted 10 March, 2025; originally announced March 2025.
   Comments: Accepted for publication at IEEE International Conference on Communications (ICC), 2025
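
The AE-NOMA abstract describes training encoder/decoder networks end to end so the superimposed constellation stays separable without SIC. A minimal PyTorch sketch of that idea, assuming a hypothetical two-user power-domain setup over an AWGN channel with made-up dimensions and gains (the paper's actual architecture, loss functions, and channel model are not given here):

```python
# Minimal sketch of autoencoder-based NOMA (AE-NOMA) constellation design.
# Hypothetical setup: 2 users, 4 messages each, AWGN; not the paper's exact model.
import torch
import torch.nn as nn

M = 4  # messages per user (assumed)

class Encoder(nn.Module):
    def __init__(self):
        super().__init__()
        self.net = nn.Sequential(nn.Linear(M, 32), nn.ReLU(), nn.Linear(32, 2))
    def forward(self, onehot):
        x = self.net(onehot)
        return x / x.norm(dim=1, keepdim=True).mean()  # keep roughly unit average power

enc1, enc2 = Encoder(), Encoder()
dec1 = nn.Sequential(nn.Linear(2, 32), nn.ReLU(), nn.Linear(32, M))  # user-1 detector
dec2 = nn.Sequential(nn.Linear(2, 32), nn.ReLU(), nn.Linear(32, M))  # user-2 detector
opt = torch.optim.Adam([*enc1.parameters(), *enc2.parameters(),
                        *dec1.parameters(), *dec2.parameters()], lr=1e-3)

for step in range(2000):
    s1 = torch.randint(0, M, (256,)); s2 = torch.randint(0, M, (256,))
    x = enc1(nn.functional.one_hot(s1, M).float()) + \
        0.5 * enc2(nn.functional.one_hot(s2, M).float())  # superposition (gain 0.5 assumed)
    y = x + 0.1 * torch.randn_like(x)                      # AWGN channel
    loss = nn.functional.cross_entropy(dec1(y), s1) + \
           nn.functional.cross_entropy(dec2(y), s2)        # joint, SIC-free detection
    opt.zero_grad(); loss.backward(); opt.step()
```

The point the abstract makes is visible in the loss: each user's detector decodes directly from the superimposed signal, so no interference cancellation stage is needed.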

2. arXiv:2503.07282 [pdf, other] (cs.CL)
   A Graph-based Verification Framework for Fact-Checking
   Authors: Yani Huang, Richong Zhang, Zhijie Nie, Junfan Chen, Xuefeng Zhang
   Abstract: Fact-checking plays a crucial role in combating misinformation. Existing methods using large language models (LLMs) for claim decomposition face two key limitations: (1) insufficient decomposition, introducing unnecessary complexity to the verification process, and (2) ambiguity of mentions, leading to incorrect verification results. To address these challenges, we suggest introducing a claim graph consisting of triplets to address the insufficient decomposition problem and to reduce mention ambiguity through graph structure. Based on this core idea, we propose a graph-based framework, GraphFC, for fact-checking. The framework features three key components: graph construction, which builds both claim and evidence graphs; graph-guided planning, which prioritizes the triplet verification order; and graph-guided checking, which verifies the triplets one by one between the claim and evidence graphs. Extensive experiments show that GraphFC enables fine-grained decomposition while resolving referential ambiguities through relational constraints, achieving state-of-the-art performance across three datasets.
   Submitted 10 March, 2025; originally announced March 2025.
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">13pages, 4figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.07197">arXiv:2503.07197</a> <span> [<a href="https://arxiv.org/pdf/2503.07197">pdf</a>, <a href="https://arxiv.org/format/2503.07197">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Effective and Efficient Masked Image Generation Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=You%2C+Z">Zebin You</a>, <a href="/search/cs?searchtype=author&query=Ou%2C+J">Jingyang Ou</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xiaolu Zhang</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+J">Jun Hu</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+J">Jun Zhou</a>, <a href="/search/cs?searchtype=author&query=Li%2C+C">Chongxuan Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.07197v1-abstract-short" style="display: inline;"> Although masked image generation models and masked diffusion models are designed with different motivations and objectives, we observe that they can be unified within a single framework. Building upon this insight, we carefully explore the design space of training and sampling, identifying key factors that contribute to both performance and efficiency. Based on the improvements observed during thi… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.07197v1-abstract-full').style.display = 'inline'; document.getElementById('2503.07197v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.07197v1-abstract-full" style="display: none;"> Although masked image generation models and masked diffusion models are designed with different motivations and objectives, we observe that they can be unified within a single framework. Building upon this insight, we carefully explore the design space of training and sampling, identifying key factors that contribute to both performance and efficiency. Based on the improvements observed during this exploration, we develop our model, referred to as eMIGM. Empirically, eMIGM demonstrates strong performance on ImageNet generation, as measured by Fr茅chet Inception Distance (FID). In particular, on ImageNet 256x256, with similar number of function evaluations (NFEs) and model parameters, eMIGM outperforms the seminal VAR. Moreover, as NFE and model parameters increase, eMIGM achieves performance comparable to the state-of-the-art continuous diffusion models while requiring less than 40% of the NFE. Additionally, on ImageNet 512x512, with only about 60% of the NFE, eMIGM outperforms the state-of-the-art continuous diffusion models. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.07197v1-abstract-full').style.display = 'none'; document.getElementById('2503.07197v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.07110">arXiv:2503.07110</a> <span> [<a href="https://arxiv.org/pdf/2503.07110">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Medical Physics">physics.med-ph</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> A LSTM-Transformer Model for pulsation control of pVADs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=E%2C+C">Chaoran E</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+C">Chenghan Chen</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+Y">Yuyang Shi</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+H">Haiyun Wang</a>, <a href="/search/cs?searchtype=author&query=Hua%2C+P">Peixin Hua</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xiwen Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.07110v1-abstract-short" style="display: inline;"> Methods: A method of the pulsation for a pVAD is proposed (AP-pVAD Model). AP-pVAD Model consists of two parts: NPQ Model and LSTM-Transformer Model. (1)The NPQ Model determines the mathematical relationship between motor speed, pressure, and flow rate for the pVAD. (2)The Attention module of Transformer neural network is integrated into the LSTM neural network to form the new LSTM-Transformer Mod… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.07110v1-abstract-full').style.display = 'inline'; document.getElementById('2503.07110v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.07110v1-abstract-full" style="display: none;"> Methods: A method of the pulsation for a pVAD is proposed (AP-pVAD Model). AP-pVAD Model consists of two parts: NPQ Model and LSTM-Transformer Model. (1)The NPQ Model determines the mathematical relationship between motor speed, pressure, and flow rate for the pVAD. (2)The Attention module of Transformer neural network is integrated into the LSTM neural network to form the new LSTM-Transformer Model to predict the pulsation time characteristic points for adjusting the motor speed of the pVAD. Results: The AP-pVAD Model is validated in three hydraulic experiments and an animal experiment. (1)The pressure provided by pVAD calculated with the NPQ Model has a maximum error of only 2.15 mmHg compared to the expected values. (2)The pulsation time characteristic points predicted by the LSTM-Transformer Model shows a maximum prediction error of 1.78ms, which is significantly lower than other methods. 
   (3) The in-vivo test of the pVAD in the animal experiment shows significant improvements in aortic pressure; animals survive for over 27 hours after the initiation of pVAD operation.
   Conclusion: (1) For a given pVAD, motor speed has a linear relationship with pressure and a quadratic relationship with flow. (2) Deep learning can be used to predict pulsation characteristic time points; the LSTM-Transformer Model shows minimal prediction error and robust performance under limited dataset sizes, elevated noise levels, and diverse hyperparameter combinations, demonstrating its feasibility and effectiveness.
   Submitted 10 March, 2025; originally announced March 2025.
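
The conclusion's stated speed-pressure (linear) and speed-flow (quadratic) relationships are exactly what an NPQ-style calibration has to fit. A minimal least-squares sketch under that stated form, with invented sample data (the paper's coefficients and measurements are not reproduced here):

```python
# Fit the stated NPQ-style relationships: pressure linear in speed, flow quadratic.
# The sample data below is invented for illustration only.
import numpy as np

speed = np.array([2000., 3000., 4000., 5000.])      # motor speed (rpm)
pressure = np.array([20., 35., 51., 66.])           # mmHg (made up)
flow = np.array([0.8, 1.9, 3.5, 5.6])               # L/min (made up)

a, b = np.polyfit(speed, pressure, deg=1)           # P ~ a*N + b
c2, c1, c0 = np.polyfit(speed, flow, deg=2)         # Q ~ c2*N^2 + c1*N + c0

# Invert the linear model to choose a speed for a target pressure:
target_p = 45.0
n_cmd = (target_p - b) / a
print(f"speed for {target_p} mmHg: {n_cmd:.0f} rpm, predicted flow "
      f"{np.polyval([c2, c1, c0], n_cmd):.2f} L/min")
```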

5. arXiv:2503.06989 [pdf, other] (cs.CR, cs.CV)
   Utilizing Jailbreak Probability to Attack and Safeguard Multimodal LLMs
   Authors: Wenzhuo Xu, Zhipeng Wei, Xiongtao Sun, Deyue Zhang, Dongdong Yang, Quanchen Zou, Xiangzheng Zhang
   Abstract: Recently, Multimodal Large Language Models (MLLMs) have demonstrated superior ability in understanding multimodal content. However, they remain vulnerable to jailbreak attacks, which exploit weaknesses in their safety alignment to generate harmful responses. Previous studies categorize jailbreaks as successful or failed based on whether responses contain malicious content. However, given the stochastic nature of MLLM responses, this binary classification of an input's ability to jailbreak MLLMs is inappropriate. From this viewpoint, we introduce jailbreak probability to quantify the jailbreak potential of an input: the likelihood that an MLLM generates a malicious response when prompted with that input. We approximate this probability through multiple queries to MLLMs. After modeling the relationship between input hidden states and their corresponding jailbreak probability using a Jailbreak Probability Prediction Network (JPPN), we use continuous jailbreak probability for optimization. Specifically, we propose the Jailbreak-Probability-based Attack (JPA), which optimizes adversarial perturbations on inputs to maximize jailbreak probability. To counteract attacks, we also propose two defensive methods: Jailbreak-Probability-based Finetuning (JPF) and Jailbreak-Probability-based Defensive Noise (JPDN), which minimize jailbreak probability in the MLLM parameters and the input space, respectively. Extensive experiments show that (1) JPA yields improvements (up to 28.38%) under both white-box and black-box settings compared to previous methods, with small perturbation bounds and few iterations, and (2) JPF and JPDN significantly reduce jailbreaks, by over 60% in the best case. Both results demonstrate the significance of introducing jailbreak probability to make nuanced distinctions among inputs' jailbreak abilities.
   Submitted 10 March, 2025; originally announced March 2025.
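
Since jailbreak probability is defined over the model's stochastic outputs, it can be approximated by repeated querying, as the abstract notes. A minimal sketch of that estimation loop; query_mllm and is_malicious are hypothetical stubs (the JPPN then learns to predict this estimate from input hidden states):

```python
# Monte-Carlo estimate of an input's jailbreak probability, as described:
# query the (stochastic) model repeatedly and average a malice judgment.
# `query_mllm` and `is_malicious` are hypothetical placeholders.
import random

def query_mllm(prompt: str, image=None) -> str:
    return random.choice(["I can't help with that.", "Step 1: ..."])  # stub model

def is_malicious(response: str) -> bool:
    return response.startswith("Step 1")  # stub judge

def jailbreak_probability(prompt: str, image=None, n_queries: int = 20) -> float:
    hits = sum(is_malicious(query_mllm(prompt, image)) for _ in range(n_queries))
    return hits / n_queries

p = jailbreak_probability("some probe prompt")
print(f"estimated jailbreak probability: {p:.2f}")  # continuous target for JPPN/JPA
```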

6. arXiv:2503.06929 [pdf, other] (q-fin.RM, cs.LG)
   Assessing Uncertainty in Stock Returns: A Gaussian Mixture Distribution-Based Method
   Authors: Yanlong Wang, Jian Xu, Shao-Lun Huang, Danny Dongning Sun, Xiao-Ping Zhang
   Abstract: This study seeks to advance the understanding and prediction of stock market return uncertainty through the application of advanced deep learning techniques. We introduce a novel deep learning model that utilizes a Gaussian mixture distribution to capture the complex, time-varying nature of asset return distributions in the Chinese stock market. By incorporating the Gaussian mixture distribution, our approach effectively characterizes short-term fluctuations and non-traditional features of stock returns, such as skewness and heavy tails, that are often overlooked by traditional models. Compared to GARCH models and their variants, our method demonstrates superior performance in volatility estimation, particularly during periods of heightened market volatility. It provides more accurate volatility forecasts and offers unique risk insights for different assets, thereby deepening the understanding of return uncertainty. Additionally, we propose a novel use of code embeddings: a bag-of-words approach trains hidden representations of stock codes, transforming the uncertainty attributes of stocks into high-dimensional vectors. These vectors are subsequently reduced to two dimensions, allowing the observation of similarity among different stocks. This visualization facilitates the identification of asset clusters with similar risk profiles, offering valuable insights for portfolio management and risk mitigation.
   Since we predict the uncertainty of returns by estimating their latent distribution, evaluating the predicted distribution is challenging when the true distribution is unobservable. We therefore measure it with the CRPS, which assesses how well the predicted distribution matches realized returns, and with MSE and QLIKE metrics, which evaluate the error between the predicted distribution's volatility level and proxy measures of true volatility.
   Submitted 10 March, 2025; originally announced March 2025.
   Comments: 23 pages
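
A mixture-density head is one standard way to realize the Gaussian-mixture return model the abstract describes. A minimal PyTorch sketch under that assumption; the component count, feature size, and layer shapes are invented, as the paper's architecture is not specified here:

```python
# Minimal mixture-density head: features -> (weights, means, stds) of a
# K-component Gaussian mixture over next-period return. Shapes are invented.
import torch
import torch.nn as nn

class GaussianMixtureHead(nn.Module):
    def __init__(self, in_dim=64, k=3):
        super().__init__()
        self.proj = nn.Linear(in_dim, 3 * k)  # mixture logits, means, log-stds

    def forward(self, h):
        logits, mu, log_sigma = self.proj(h).chunk(3, dim=-1)
        return logits, mu, log_sigma.exp().clamp(min=1e-4)

def nll(logits, mu, sigma, r):
    """Negative log-likelihood of realized returns r under the predicted mixture."""
    comp = torch.distributions.Normal(mu, sigma).log_prob(r.unsqueeze(-1))
    return -torch.logsumexp(torch.log_softmax(logits, -1) + comp, dim=-1).mean()

head = GaussianMixtureHead()
h = torch.randn(32, 64)          # encoder features for 32 stocks (dummy)
r = 0.01 * torch.randn(32)       # realized returns (dummy)
loss = nll(*head(h), r)          # train by minimizing mixture NLL
loss.backward()
```

Because the head outputs a full distribution rather than a point volatility, skewness and heavy tails fall out of the mixture itself, which is the contrast with GARCH the abstract draws.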

7. arXiv:2503.06928 [pdf, other] (cs.LG, q-fin.TR)
   FinTSBridge: A New Evaluation Suite for Real-world Financial Prediction with Advanced Time Series Models
   Authors: Yanlong Wang, Jian Xu, Tiantian Gao, Hongkang Zhang, Shao-Lun Huang, Danny Dongning Sun, Xiao-Ping Zhang
   Abstract: Time series forecasting has attracted growing attention in recent years, and many studies have proposed solutions to the challenges of time series prediction, aiming to improve forecasting performance. However, effectively applying these forecasting models to financial asset pricing remains challenging: a bridge is still needed to connect cutting-edge time series forecasting models with financial asset pricing. To bridge this gap, we have undertaken the following efforts: 1) we constructed three datasets from the financial domain; 2) we selected over ten time series forecasting models from recent studies and validated their performance on financial time series; 3) we developed new metrics, msIC and msIR, in addition to MSE and MAE, to showcase the time series correlation captured by the models; and 4) we designed financial-specific tasks for these three datasets and assessed the practical performance and application potential of the forecasting models on important financial problems. We hope the new evaluation suite, FinTSBridge, can provide valuable insights into the effectiveness and robustness of advanced forecasting models in financial domains.
   Submitted 10 March, 2025; originally announced March 2025.
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ICLR 2025 Workshop Advances in Financial AI</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.06861">arXiv:2503.06861</a> <span> [<a href="https://arxiv.org/pdf/2503.06861">pdf</a>, <a href="https://arxiv.org/format/2503.06861">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Enhanced Multi-Tuple Extraction for Alloys: Integrating Pointer Networks and Augmented Attention </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Hei%2C+M">Mengzhe Hei</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Z">Zhouran Zhang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Q">Qingbao Liu</a>, <a href="/search/cs?searchtype=author&query=Pan%2C+Y">Yan Pan</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+X">Xiang Zhao</a>, <a href="/search/cs?searchtype=author&query=Peng%2C+Y">Yongqian Peng</a>, <a href="/search/cs?searchtype=author&query=Ye%2C+Y">Yicong Ye</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xin Zhang</a>, <a href="/search/cs?searchtype=author&query=Bai%2C+S">Shuxin Bai</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.06861v1-abstract-short" style="display: inline;"> Extracting high-quality structured information from scientific literature is crucial for advancing material design through data-driven methods. Despite the considerable research in natural language processing for dataset extraction, effective approaches for multi-tuple extraction in scientific literature remain scarce due to the complex interrelations of tuples and contextual ambiguities. In the s… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.06861v1-abstract-full').style.display = 'inline'; document.getElementById('2503.06861v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.06861v1-abstract-full" style="display: none;"> Extracting high-quality structured information from scientific literature is crucial for advancing material design through data-driven methods. Despite the considerable research in natural language processing for dataset extraction, effective approaches for multi-tuple extraction in scientific literature remain scarce due to the complex interrelations of tuples and contextual ambiguities. In the study, we illustrate the multi-tuple extraction of mechanical properties from multi-principal-element alloys and presents a novel framework that combines an entity extraction model based on MatSciBERT with pointer networks and an allocation model utilizing inter- and intra-entity attention. Our rigorous experiments on tuple extraction demonstrate impressive F1 scores of 0.963, 0.947, 0.848, and 0.753 across datasets with 1, 2, 3, and 4 tuples, confirming the effectiveness of the model. Furthermore, an F1 score of 0.854 was achieved on a randomly curated dataset. 
   These results highlight the model's capacity to deliver precise and structured information, offering a robust alternative to large language models and equipping researchers with essential data for fostering data-driven innovations.
   Submitted 9 March, 2025; originally announced March 2025.
   Comments: 17 pages, 5 figures
   Report number: 410072

9. arXiv:2503.06839 [pdf, other] (cs.CV, cs.AI)
   AttFC: Attention Fully-Connected Layer for Large-Scale Face Recognition with One GPU
   Authors: Zhuowen Zheng, Yain-Whar Si, Xiaochen Yuan, Junwei Duan, Ke Wang, Xiaofan Li, Xinyuan Zhang, Xueyuan Gong
   Abstract: Nowadays, with the advancement of deep neural networks (DNNs) and the availability of large-scale datasets, face recognition (FR) models have achieved exceptional performance. However, the parameter magnitude of the fully connected (FC) layer depends directly on the number of identities in the dataset, so training an FR model on a large-scale dataset makes the model parameters excessively large, leading to substantial demand for computational resources such as time and memory. This paper proposes the attention fully connected (AttFC) layer, which can significantly reduce computational resources. AttFC employs an attention loader to generate the generative class center (GCC) and dynamically stores class centers with a Dynamic Class Container (DCC). The DCC stores only a small subset of all class centers, so its parameter count is substantially less than that of the full FC layer. Training face recognition models on large-scale datasets with one GPU also often encounters out-of-memory (OOM) issues; AttFC overcomes this and achieves performance comparable to state-of-the-art methods.
   Submitted 9 March, 2025; originally announced March 2025.
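
The described mechanism, a classification layer that keeps only a small, dynamically managed subset of class centers resident, can be sketched as follows. The container size, eviction policy, and center initialization are assumptions for illustration, not the paper's exact AttFC design (which generates centers with an attention loader):

```python
# Sketch of a dynamic-class-container FC layer: only a subset of class
# centers is resident, so the classification layer's memory stays bounded.
# Container size and eviction policy are invented for illustration.
import torch
import torch.nn.functional as F

emb_dim, n_ids, container = 512, 1_000_000, 4096
centers = {}  # class id -> center vector; at most `container` entries resident

def logits_for_batch(feats, labels):
    # ensure the batch's class centers are resident (evict arbitrarily if full)
    for y in labels.tolist():
        if y not in centers:
            if len(centers) >= container:
                centers.pop(next(iter(centers)))
            centers[y] = torch.randn(emb_dim) * 0.01  # would come from a loader/GCC
    ids = list(centers.keys())
    W = torch.stack([centers[i] for i in ids])             # (resident, emb_dim)
    logits = F.normalize(feats, dim=1) @ F.normalize(W, dim=1).T
    target = torch.tensor([ids.index(y) for y in labels.tolist()])
    return logits, target

feats = torch.randn(8, emb_dim)                  # backbone embeddings (dummy)
labels = torch.randint(0, n_ids, (8,))
logits, target = logits_for_batch(feats, labels)
loss = F.cross_entropy(logits, target)           # scores only resident classes
```

Memory here scales with the container size rather than the number of identities, which is the property the abstract claims lets a single GPU handle large-scale datasets.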

10. arXiv:2503.06680 [pdf, other] (cs.SE)
   FEA-Bench: A Benchmark for Evaluating Repository-Level Code Generation for Feature Implementation
   Authors: Wei Li, Xin Zhang, Zhongxin Guo, Shaoguang Mao, Wen Luo, Guangyue Peng, Yangyu Huang, Houfeng Wang, Scarlett Li
   Abstract: Implementing new features in repository-level codebases is a crucial application of code generation models. However, current benchmarks lack a dedicated evaluation framework for this capability. To fill this gap, we introduce FEA-Bench, a benchmark designed to assess the ability of large language models (LLMs) to perform incremental development within code repositories. We collect pull requests from 83 GitHub repositories and use rule-based and intent-based filtering to construct task instances focused on new feature development. Each task instance containing code changes is paired with relevant unit test files so that solutions can be verified. Feature implementation requires LLMs to simultaneously possess code-completion capabilities for new components and code-editing abilities for other relevant parts of the repository, providing a more comprehensive evaluation of LLMs' automated software engineering capabilities. Experimental results show that LLMs perform significantly worse on FEA-Bench, highlighting considerable challenges in such repository-level incremental code development.
   Submitted 9 March, 2025; originally announced March 2025.
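
As described, FEA-Bench's unit of evaluation pairs a PR-derived feature change with unit tests that verify a candidate solution. A sketch of what such a task instance might look like as a data structure; all field names and the verification stub are assumptions, not FEA-Bench's actual schema:

```python
# Hypothetical shape of a repository-level feature-implementation task
# instance: prompt context, gold diff, and the tests that verify a solution.
from dataclasses import dataclass, field

@dataclass
class FeatureTask:
    repo: str                      # e.g. "owner/project"
    base_commit: str               # repository state the model starts from
    feature_description: str       # natural-language description of the feature
    gold_patch: str                # reference diff from the merged pull request
    unit_test_files: list[str] = field(default_factory=list)

def verify(task: FeatureTask, candidate_patch: str) -> bool:
    """A solution passes if the repo, with the candidate patch applied,
    passes the paired unit tests (patching and test execution stubbed out)."""
    apply_ok = candidate_patch.strip() != ""      # stand-in for `git apply`
    tests_ok = apply_ok                           # stand-in for running the tests
    return apply_ok and tests_ok
```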

11. arXiv:2503.06661 [pdf, other] (cs.CV, cs.AI)
   AA-CLIP: Enhancing Zero-shot Anomaly Detection via Anomaly-Aware CLIP
   Authors: Wenxin Ma, Xu Zhang, Qingsong Yao, Fenghe Tang, Chenxu Wu, Yingtai Li, Rui Yan, Zihang Jiang, S. Kevin Zhou
   Abstract: Anomaly detection (AD) identifies outliers for applications such as defect and lesion detection. While CLIP shows promise for zero-shot AD tasks due to its strong generalization capabilities, its inherent anomaly-unawareness leads to limited discrimination between normal and abnormal features. To address this problem, we propose Anomaly-Aware CLIP (AA-CLIP), which enhances CLIP's anomaly discrimination ability in both text and visual spaces while preserving its generalization capability. AA-CLIP is achieved through a straightforward yet effective two-stage approach: it first creates anomaly-aware text anchors to clearly differentiate normal and abnormal semantics, then aligns patch-level visual features with these anchors for precise anomaly localization. This two-stage strategy, with the help of residual adapters, gradually adapts CLIP in a controlled manner, achieving effective AD while maintaining CLIP's class knowledge. Extensive experiments validate AA-CLIP as a resource-efficient solution for zero-shot AD tasks, achieving state-of-the-art results in industrial and medical applications. The code is available at https://github.com/Mwxinnn/AA-CLIP.
   Submitted 9 March, 2025; originally announced March 2025.
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">8 pages, 7 figures</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> CVPR 2025 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.06659">arXiv:2503.06659</a> <span> [<a href="https://arxiv.org/pdf/2503.06659">pdf</a>, <a href="https://arxiv.org/format/2503.06659">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> </div> </div> <p class="title is-5 mathjax"> PANDA: Parkinson's Assistance and Notification Driving Aid </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wen%2C+T">Tianyang Wen</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xucheng Zhang</a>, <a href="/search/cs?searchtype=author&query=Wan%2C+Z">Zhirong Wan</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+J">Jing Zhao</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+Y">Yicheng Zhu</a>, <a href="/search/cs?searchtype=author&query=Su%2C+N">Ning Su</a>, <a href="/search/cs?searchtype=author&query=Peng%2C+X">Xiaolan Peng</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+J">Jin Huang</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+W">Wei Sun</a>, <a href="/search/cs?searchtype=author&query=Tian%2C+F">Feng Tian</a>, <a href="/search/cs?searchtype=author&query=Li%2C+F+M">Franklin Mingzhe Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.06659v1-abstract-short" style="display: inline;"> Parkinson's Disease (PD) significantly impacts driving abilities, often leading to early driving cessation or accidents due to reduced motor control and increasing reaction times. To diminish the impact of these symptoms, we developed PANDA (Parkinson's Assistance and Notification Driving Aid), a multi-modality real-time alert system designed to monitor driving patterns continuously and provide im… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.06659v1-abstract-full').style.display = 'inline'; document.getElementById('2503.06659v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.06659v1-abstract-full" style="display: none;"> Parkinson's Disease (PD) significantly impacts driving abilities, often leading to early driving cessation or accidents due to reduced motor control and increasing reaction times. To diminish the impact of these symptoms, we developed PANDA (Parkinson's Assistance and Notification Driving Aid), a multi-modality real-time alert system designed to monitor driving patterns continuously and provide immediate alerts for irregular driving behaviors, enhancing driver safety of individuals with PD. The system was developed through a participatory design process with 9 people with PD and 13 non-PD individuals using a driving simulator, which allowed us to identify critical design characteristics and collect detailed data on driving behavior. 
A user study involving individuals with PD evaluated the effectiveness of PANDA, exploring optimal strategies for delivering alerts and ensuring they are timely and helpful. Our findings demonstrate that PANDA has the potential to enhance the driving safety of individuals with PD, offering a valuable tool for maintaining independence and confidence behind the wheel.
Submitted 9 March, 2025; originally announced March 2025.

arXiv:2503.06623 (https://arxiv.org/abs/2503.06623) [pdf, other] (cs.CV)
Transforming Weather Data from Pixel to Latent Space
Authors: Sijie Zhao, Feng Liu, Xueliang Zhang, Hao Chen, Tao Han, Junchao Gong, Ran Tao, Pengfeng Xiao, Lei Bai, Wanli Ouyang
Abstract: The increasing impact of climate change and extreme weather events has spurred growing interest in deep learning for weather research. However, existing studies often rely on weather data in pixel space, which presents several challenges, such as overly smooth model outputs, limited applicability to a single pressure-variable subset (PVS), and high data storage and computational costs. To address these challenges, we propose a novel Weather Latent Autoencoder (WLA) that transforms weather data from pixel space to latent space, enabling efficient weather task modeling. By decoupling weather reconstruction from downstream tasks, WLA improves the accuracy and sharpness of weather task model results. The incorporated Pressure-Variable Unified Module transforms multiple PVS into a unified representation, enhancing the adaptability of the model in multiple weather scenarios. Furthermore, weather tasks can be performed in the low-storage latent space of WLA rather than the high-storage pixel space, significantly reducing data storage and computational costs. Through extensive experimentation, we demonstrate its superior compression and reconstruction performance, enabling the creation of the ERA5-latent dataset with unified representations of multiple PVS from ERA5 data. The compressed full PVS in the ERA5-latent dataset reduces the original 244.34 TB of data to 0.43 TB. The downstream task further demonstrates that task models can apply to multiple PVS with low data costs in latent space and achieve superior performance compared to models in pixel space. Code, ERA5-latent data, and pre-trained models are available at https://anonymous.4open.science/r/Weather-Latent-Autoencoder-8467.
Submitted 9 March, 2025; originally announced March 2025.
Comments: 8 pages, 6 figures
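For scale, 244.34 TB down to 0.43 TB is roughly a 568x reduction. The pixel-to-latent idea itself can be sketched with a toy convolutional autoencoder; the actual WLA architecture and its Pressure-Variable Unified Module are not reproduced here, and all channel counts and strides below are placeholders.

```python
# Illustrative pixel-to-latent sketch (not the WLA architecture): a small
# convolutional autoencoder maps a gridded weather field to a compact latent
# and back; downstream task models would operate on the latent z.
import torch
import torch.nn as nn

class TinyWeatherAE(nn.Module):
    def __init__(self, in_ch=4, latent_ch=8):
        super().__init__()
        self.enc = nn.Sequential(  # 4x spatial downsampling overall
            nn.Conv2d(in_ch, 32, 3, stride=2, padding=1), nn.GELU(),
            nn.Conv2d(32, latent_ch, 3, stride=2, padding=1),
        )
        self.dec = nn.Sequential(
            nn.ConvTranspose2d(latent_ch, 32, 4, stride=2, padding=1), nn.GELU(),
            nn.ConvTranspose2d(32, in_ch, 4, stride=2, padding=1),
        )

    def forward(self, x):
        z = self.enc(x)
        return self.dec(z), z

x = torch.randn(1, 4, 64, 64)        # e.g., 4 pressure-variable channels on a grid
recon, z = TinyWeatherAE()(x)
print(z.shape, recon.shape)          # latent grid is 16x smaller spatially
```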
class="abstract-short has-text-grey-dark mathjax" id="2503.06617v1-abstract-short" style="display: inline;"> Arbitrary-scale super-resolution (ASSR) aims to reconstruct high-resolution (HR) images from low-resolution (LR) inputs with arbitrary upsampling factors using a single model, addressing the limitations of traditional SR methods constrained to fixed-scale factors (\textit{e.g.}, $\times$ 2). Recent advances leveraging implicit neural representation (INR) have achieved great progress by modeling co… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.06617v1-abstract-full').style.display = 'inline'; document.getElementById('2503.06617v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.06617v1-abstract-full" style="display: none;"> Arbitrary-scale super-resolution (ASSR) aims to reconstruct high-resolution (HR) images from low-resolution (LR) inputs with arbitrary upsampling factors using a single model, addressing the limitations of traditional SR methods constrained to fixed-scale factors (\textit{e.g.}, $\times$ 2). Recent advances leveraging implicit neural representation (INR) have achieved great progress by modeling coordinate-to-pixel mappings. However, the efficiency of these methods may suffer from repeated upsampling and decoding, while their reconstruction fidelity and quality are constrained by the intrinsic representational limitations of coordinate-based functions. To address these challenges, we propose a novel ContinuousSR framework with a Pixel-to-Gaussian paradigm, which explicitly reconstructs 2D continuous HR signals from LR images using Gaussian Splatting. This approach eliminates the need for time-consuming upsampling and decoding, enabling extremely fast arbitrary-scale super-resolution. Once the Gaussian field is built in a single pass, ContinuousSR can perform arbitrary-scale rendering in just 1ms per scale. Our method introduces several key innovations. Through statistical ana <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.06617v1-abstract-full').style.display = 'none'; document.getElementById('2503.06617v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Tech Report</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.06583">arXiv:2503.06583</a> <span> [<a href="https://arxiv.org/pdf/2503.06583">pdf</a>, <a href="https://arxiv.org/format/2503.06583">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> </div> </div> <p class="title is-5 mathjax"> A Modular and Extensible Hardware Platform Prototype for Dynamic Data Physicalisation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xuyao Zhang</a>, <a href="/search/cs?searchtype=author&query=Ili%C4%87%2C+M">Milan Ili膰</a>, <a href="/search/cs?searchtype=author&query=Signer%2C+B">Beat Signer</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.06583v1-abstract-short" style="display: inline;"> Dynamic data physicalisation is an emerging field of research, investigating the representation and exploration of data via multiple modalities, beyond traditional visual methods. Despite the development of various data physicalisation applications in recent years, the integration of diverse hardware components remains both time-consuming and costly. Further, there is a lack of solutions for rapid… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.06583v1-abstract-full').style.display = 'inline'; document.getElementById('2503.06583v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.06583v1-abstract-full" style="display: none;"> Dynamic data physicalisation is an emerging field of research, investigating the representation and exploration of data via multiple modalities, beyond traditional visual methods. Despite the development of various data physicalisation applications in recent years, the integration of diverse hardware components remains both time-consuming and costly. Further, there is a lack of solutions for rapid prototyping and experimentation with different dynamic data physicalisation alternatives. To address this problem, we propose a modular and extensible hardware platform for dynamic data physicalisation. This platform introduces a communication architecture that ensures seamless plug-and-play functionality for modules representing different physical variables. We detail the implementation and technical evaluation of a preliminary prototype of our platform, demonstrating its potential to facilitate rapid prototyping and experimentation with various data physicalisation designs. The platform aims to support researchers and developers in the field by providing a versatile and efficient tool for the rapid prototyping and experimentation with different data physicalisation design alternatives. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.06583v1-abstract-full').style.display = 'none'; document.getElementById('2503.06583v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Report number:</span> WISE-2025-01 <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> H.5.2 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.06446">arXiv:2503.06446</a> <span> [<a href="https://arxiv.org/pdf/2503.06446">pdf</a>, <a href="https://arxiv.org/format/2503.06446">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> M$^3$amba: CLIP-driven Mamba Model for Multi-modal Remote Sensing Classification </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Cao%2C+M">Mingxiang Cao</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+W">Weiying Xie</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xin Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jiaqing Zhang</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+K">Kai Jiang</a>, <a href="/search/cs?searchtype=author&query=Lei%2C+J">Jie Lei</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yunsong Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.06446v1-abstract-short" style="display: inline;"> Multi-modal fusion holds great promise for integrating information from different modalities. However, due to a lack of consideration for modal consistency, existing multi-modal fusion methods in the field of remote sensing still face challenges of incomplete semantic information and low computational efficiency in their fusion designs. Inspired by the observation that the visual language pre-trai… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.06446v1-abstract-full').style.display = 'inline'; document.getElementById('2503.06446v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.06446v1-abstract-full" style="display: none;"> Multi-modal fusion holds great promise for integrating information from different modalities. However, due to a lack of consideration for modal consistency, existing multi-modal fusion methods in the field of remote sensing still face challenges of incomplete semantic information and low computational efficiency in their fusion designs. Inspired by the observation that the visual language pre-training model CLIP can effectively extract strong semantic information from visual features, we propose M$^3$amba, a novel end-to-end CLIP-driven Mamba model for multi-modal fusion to address these challenges. 
Abstract: Multi-modal fusion holds great promise for integrating information from different modalities. However, due to a lack of consideration for modal consistency, existing multi-modal fusion methods in the field of remote sensing still face the challenges of incomplete semantic information and low computational efficiency in their fusion designs. Inspired by the observation that the vision-language pre-training model CLIP can effectively extract strong semantic information from visual features, we propose M$^3$amba, a novel end-to-end CLIP-driven Mamba model for multi-modal fusion that addresses these challenges. Specifically, we introduce CLIP-driven modality-specific adapters in the fusion architecture to avoid the bias in understanding specific domains caused by direct inference, making the original CLIP encoder modality-specific in its perception. This unified framework enables minimal training to achieve a comprehensive semantic understanding of different modalities, thereby guiding cross-modal feature fusion. To further enhance the consistent association between modality mappings, a multi-modal Mamba fusion architecture with linear complexity and a cross-attention module, Cross-SS2D, are designed, which fully consider effective and efficient information interaction to achieve complete fusion. Extensive experiments show that M$^3$amba achieves an average performance improvement of at least 5.98% compared with state-of-the-art methods on multi-modal hyperspectral image classification tasks in the remote sensing field, while also demonstrating excellent training efficiency, with improvements in both accuracy and efficiency. The code is released at https://github.com/kaka-Cao/M3amba.
Submitted 9 March, 2025; originally announced March 2025.
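A modality-specific adapter on top of a frozen encoder is commonly a small residual bottleneck. The sketch below shows that generic pattern only, as a reading of "minimal training" per modality; the paper's exact module and its Mamba fusion are not reproduced, and all sizes are assumptions.

```python
# Generic residual adapter sketch (illustrative, not the authors' module):
# a lightweight bottleneck added on top of frozen CLIP features gives each
# modality its own cheap, trainable specialization.
import torch
import torch.nn as nn

class ModalityAdapter(nn.Module):
    def __init__(self, dim=512, bottleneck=64):
        super().__init__()
        self.down = nn.Linear(dim, bottleneck)
        self.up = nn.Linear(bottleneck, dim)
        self.act = nn.GELU()

    def forward(self, feats):
        # the residual connection keeps the frozen CLIP semantics intact
        return feats + self.up(self.act(self.down(feats)))

clip_feats = torch.randn(8, 512)       # stand-in for frozen-encoder output
hsi_adapter = ModalityAdapter()        # one adapter per modality
print(hsi_adapter(clip_feats).shape)   # torch.Size([8, 512])
```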
arXiv:2503.06187 (https://arxiv.org/abs/2503.06187) [pdf, other] (cs.CV, cs.AI)
MSConv: Multiplicative and Subtractive Convolution for Face Recognition
Authors: Si Zhou, Yain-Whar Si, Xiaochen Yuan, Xiaofan Li, Xiaoxiang Liu, Xinyuan Zhang, Cong Lin, Xueyuan Gong
Abstract: In neural networks, there are various methods of feature fusion. Different strategies can significantly affect the effectiveness of feature representation, and consequently the ability of a model to extract representative and discriminative features. In the field of face recognition, traditional feature fusion methods include feature concatenation and feature addition. Recently, various attention-mechanism-based fusion strategies have emerged. However, we found that these methods primarily focus on the important features in the image, referred to as salient features in this paper, while neglecting another equally important set of features for image recognition tasks, which we term differential features. This may cause the model to overlook critical local differences when dealing with complex facial samples. Therefore, in this paper, we propose an efficient convolution module called MSConv (Multiplicative and Subtractive Convolution), designed to balance the model's learning of salient and differential features. Specifically, we employ multi-scale mixed convolution to capture both local and broader contextual information from face images, and then utilize a Multiplication Operation (MO) and a Subtraction Operation (SO) to extract salient and differential features, respectively. Experimental results demonstrate that by integrating both salient and differential features, MSConv outperforms models that focus only on salient features.
Submitted 8 March, 2025; originally announced March 2025.
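The MO/SO idea is easy to state in code. A sketch with placeholder layer sizes, assuming two branches with different receptive fields stand in for the multi-scale mixed convolution; this is not the paper's exact module.

```python
# Sketch of multiplicative/subtractive fusion (illustrative): two conv
# branches with different receptive fields are combined element-wise by
# multiplication (salient features) and subtraction (differential features).
import torch
import torch.nn as nn

class MSConvSketch(nn.Module):
    def __init__(self, ch=32):
        super().__init__()
        self.local = nn.Conv2d(ch, ch, 3, padding=1)   # local context
        self.broad = nn.Conv2d(ch, ch, 5, padding=2)   # broader context
        self.fuse = nn.Conv2d(2 * ch, ch, 1)

    def forward(self, x):
        a, b = self.local(x), self.broad(x)
        salient = a * b           # MO: emphasizes agreement between scales
        differential = a - b      # SO: emphasizes local differences
        return self.fuse(torch.cat([salient, differential], dim=1))

y = MSConvSketch()(torch.randn(2, 32, 56, 56))
print(y.shape)  # torch.Size([2, 32, 56, 56])
```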
arXiv:2503.06184 (https://arxiv.org/abs/2503.06184) [pdf, other] (cs.CL, cs.AI, cs.LG)
Sample-aware Adaptive Structured Pruning for Large Language Models
Authors: Jun Kong, Xinge Ma, Jin Wang, Xuejie Zhang
Abstract: Large language models (LLMs) have achieved outstanding performance in natural language processing, but enormous model sizes and high computational costs limit their practical deployment. Structured pruning can effectively reduce the resource demands of deployment by removing redundant model parameters. However, the randomly selected calibration data and fixed single importance estimation metrics in existing structured pruning methods lead to degraded performance of the pruned models. This study introduces AdaPruner, a sample-aware adaptive structured pruning framework for LLMs, which optimizes the calibration data and the importance estimation metric used in the structured pruning process. Specifically, AdaPruner effectively removes redundant parameters from LLMs by constructing a structured pruning solution space and then employing Bayesian optimization to adaptively search for the optimal calibration data and importance estimation metric. Experimental results show that AdaPruner outperforms existing structured pruning methods on a family of LLMs with varying pruning ratios, demonstrating its applicability and robustness. Remarkably, at a 20% pruning ratio, the model pruned with AdaPruner maintains 97% of the performance of the unpruned model.
Submitted 8 March, 2025; originally announced March 2025.
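The search loop can be pictured as below. Random search stands in for the Bayesian optimization the abstract describes, and prune_and_eval is a hypothetical placeholder for pruning with a given calibration subset and importance metric, then scoring the pruned model; metric names are invented for illustration.

```python
# Schematic search over (calibration subset, importance metric) pairs.
# Random search is a stand-in for Bayesian optimization; prune_and_eval is
# a placeholder that a real system would implement as "prune, then measure
# downstream quality (e.g., perplexity) of the pruned model".
import random

METRICS = ["weight_magnitude", "activation_norm", "taylor_first_order"]

def prune_and_eval(calib_subset, metric):
    return random.random()  # placeholder score, higher is better

pool = [f"calib_doc_{i}" for i in range(100)]
best_score, best = -1.0, None
for metric in METRICS:
    for _ in range(16):                          # trials per metric
        cand = (random.sample(pool, 8), metric)  # candidate configuration
        score = prune_and_eval(*cand)
        if score > best_score:
            best_score, best = score, cand

print("chosen metric:", best[1], "| calibration subset size:", len(best[0]))
```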
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.06184v1-abstract-full').style.display = 'none'; document.getElementById('2503.06184v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.06118">arXiv:2503.06118</a> <span> [<a href="https://arxiv.org/pdf/2503.06118">pdf</a>, <a href="https://arxiv.org/format/2503.06118">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> SecureGS: Boosting the Security and Fidelity of 3D Gaussian Splatting Steganography </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xuanyu Zhang</a>, <a href="/search/cs?searchtype=author&query=Meng%2C+J">Jiarui Meng</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+Z">Zhipei Xu</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+S">Shuzhou Yang</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+Y">Yanmin Wu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+R">Ronggang Wang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jian Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.06118v1-abstract-short" style="display: inline;"> 3D Gaussian Splatting (3DGS) has emerged as a premier method for 3D representation due to its real-time rendering and high-quality outputs, underscoring the critical need to protect the privacy of 3D assets. Traditional NeRF steganography methods fail to address the explicit nature of 3DGS since its point cloud files are publicly accessible. Existing GS steganography solutions mitigate some issues… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.06118v1-abstract-full').style.display = 'inline'; document.getElementById('2503.06118v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.06118v1-abstract-full" style="display: none;"> 3D Gaussian Splatting (3DGS) has emerged as a premier method for 3D representation due to its real-time rendering and high-quality outputs, underscoring the critical need to protect the privacy of 3D assets. Traditional NeRF steganography methods fail to address the explicit nature of 3DGS since its point cloud files are publicly accessible. Existing GS steganography solutions mitigate some issues but still struggle with reduced rendering fidelity, increased computational demands, and security flaws, especially in the security of the geometric structure of the visualized point cloud. To address these demands, we propose a SecureGS, a secure and efficient 3DGS steganography framework inspired by Scaffold-GS's anchor point design and neural decoding. 
SecureGS uses a hybrid decoupled Gaussian encryption mechanism to embed the offsets, scales, rotations, and RGB attributes of hidden 3D Gaussian points in anchor point features, retrievable only by authorized users through privacy-preserving neural networks. To further enhance security, we propose a density region-aware anchor growing and pruning strategy that adaptively locates optimal hiding regions without exposing hidden information. Extensive experiments show that SecureGS significantly surpasses existing GS steganography methods in rendering fidelity, speed, and security.
Submitted 8 March, 2025; originally announced March 2025.
Comments: Accepted by ICLR 2025

arXiv:2503.06106 (https://arxiv.org/abs/2503.06106) [pdf, other] (cs.CV)
Vision-aware Multimodal Prompt Tuning for Uploadable Multi-source Few-shot Domain Adaptation
Authors: Kuanghong Liu, Jin Wang, Kangjian He, Dan Xu, Xuejie Zhang
Abstract: Conventional multi-source few-shot domain adaptation (MFDA) faces the challenge of further reducing the load on edge-side devices in low-resource scenarios.
Considering the native language-supervised advantage of CLIP and the plug-and-play nature of prompts for transferring CLIP efficiently, this paper introduces an uploadable multi-source few-shot domain adaptation (UMFDA) schema. It is a form of decentralized edge collaborative learning in which the edge-side models must maintain a low computational load, only a limited amount of annotation is provided for the source-domain data, and most of the data remain unannotated. Further, this paper proposes a vision-aware multimodal prompt tuning framework (VAMP) under the decentralized schema, where the vision-aware prompt guides the text domain-specific prompt to maintain semantic discriminability and perceive domain information. Cross-modal semantic and domain distribution alignment losses optimize each edge-side model, while text-classifier consistency and semantic diversity losses promote collaborative learning among edge-side models. Extensive experiments on the OfficeHome and DomainNet datasets demonstrate the effectiveness of the proposed VAMP in the UMFDA setting, where it outperforms previous prompt tuning methods.
Submitted 8 March, 2025; originally announced March 2025.
Comments: Accepted by AAAI 2025

arXiv:2503.06085 (https://arxiv.org/abs/2503.06085) [pdf, other] (cs.CL)
Multi-Attribute Multi-Grained Adaptation of Pre-Trained Language Models for Text Understanding from Bayesian Perspective
Authors: You Zhang, Jin Wang, Liang-Chih Yu, Dan Xu, Xuejie Zhang
Abstract: Current neural networks often employ multi-domain learning or attribute-injecting mechanisms to incorporate non-independent and identically distributed (non-IID) information for text understanding tasks, capturing individual characteristics and the relationships among samples. However, the extent of the impact of non-IID information and how these methods affect pre-trained language models (PLMs) remains unclear. This study revisits the assumption that non-IID information enhances PLMs to achieve performance improvements from a Bayesian perspective, which unearths and integrates non-IID and IID features. Furthermore, we propose a multi-attribute multi-grained framework for PLM adaptation (M2A), which combines multi-attribute and multi-grained views to mitigate uncertainty in a lightweight manner. We evaluate M2A on prevalent text-understanding datasets and demonstrate its superior performance, particularly when data are implicitly non-IID and PLMs scale larger.
Submitted 8 March, 2025; originally announced March 2025.
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Extended version accepted by AAAI 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.06080">arXiv:2503.06080</a> <span> [<a href="https://arxiv.org/pdf/2503.06080">pdf</a>, <a href="https://arxiv.org/ps/2503.06080">ps</a>, <a href="https://arxiv.org/format/2503.06080">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> </div> </div> <p class="title is-5 mathjax"> Fluid Antenna Meets RIS: Random Matrix Analysis and Two-Timescale Design for Multi-User Communications </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xin Zhang</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+D">Dongfang Xu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+J">Jingjing Wang</a>, <a href="/search/cs?searchtype=author&query=Song%2C+S">Shenghui Song</a>, <a href="/search/cs?searchtype=author&query=Ng%2C+D+W+K">Derrick Wing Kwan Ng</a>, <a href="/search/cs?searchtype=author&query=Debbah%2C+M">M茅rouane Debbah</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.06080v1-abstract-short" style="display: inline;"> The reconfigurability of fluid antenna systems (FASs) and reconfigurable intelligent surfaces (RISs) provides significant flexibility in optimizing channel conditions by jointly adjusting the positions of fluid antennas and the phase shifts of RISs. However, it is challenging to acquire the instantaneous channel state information (CSI) for both fluid antennas and RISs, while frequent adjustment of… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.06080v1-abstract-full').style.display = 'inline'; document.getElementById('2503.06080v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.06080v1-abstract-full" style="display: none;"> The reconfigurability of fluid antenna systems (FASs) and reconfigurable intelligent surfaces (RISs) provides significant flexibility in optimizing channel conditions by jointly adjusting the positions of fluid antennas and the phase shifts of RISs. However, it is challenging to acquire the instantaneous channel state information (CSI) for both fluid antennas and RISs, while frequent adjustment of antenna positions and phase shifts will significantly increase the system complexity. To tackle this issue, this paper investigates the two-timescale design for FAS-RIS multi-user systems with linear precoding, where only the linear precoder design requires instantaneous CSI of the end-to-end channel, while the FAS and RIS optimization relies on statistical CSI. The main challenge comes from the complex structure of channel and inverse operations in linear precoding, such as regularized zero-forcing (RZF) and zero-forcing (ZF). Leveraging on random matrix theory (RMT), we first investigate the fundamental limits of FAS-RIS systems with RZF/ZF precoding by deriving the ergodic sum rate (ESR). This result is utilized to determine the minimum number of activated antennas to achieve a given ESR. 
arXiv:2503.06028 (https://arxiv.org/abs/2503.06028) [pdf, other] (cs.LG)
Data-Free Black-Box Federated Learning via Zeroth-Order Gradient Estimation
Authors: Xinge Ma, Jin Wang, Xuejie Zhang
Abstract: Federated learning (FL) enables decentralized clients to collaboratively train a global model under the orchestration of a central server without exposing their individual data. However, the iterative exchange of model parameters between the server and the clients imposes heavy communication burdens, risks potential privacy leakage, and even precludes collaboration among heterogeneous clients. Distillation-based FL tackles these challenges by exchanging low-dimensional model outputs rather than model parameters, yet it relies heavily on a task-relevant auxiliary dataset that is often not available in practice. Data-free FL attempts to overcome this limitation by training a server-side generator to directly synthesize task-specific data samples for knowledge transfer. However, the update rule of the generator requires clients to share their on-device models for white-box access, which greatly compromises the advantages of distillation-based FL. This motivates us to explore a data-free and black-box FL framework via Zeroth-order Gradient Estimation (FedZGE), which estimates the gradients that flow through the on-device models in a black-box optimization manner to complete the training of the generator in terms of fidelity, transferability, diversity, and equilibrium, without involving any auxiliary data or sharing any model parameters, thus combining the advantages of both distillation-based FL and data-free FL. Experiments on large-scale image classification datasets and network architectures demonstrate the superiority of FedZGE in terms of data heterogeneity, model heterogeneity, communication efficiency, and privacy protection.
Submitted 7 March, 2025; originally announced March 2025.
Comments: Accepted by AAAI 2025
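The zeroth-order trick in isolation: the gradient of a black-box scalar loss is estimated from function evaluations only, via two-point finite differences along random directions. A minimal sketch follows; it shows the estimator itself, not FedZGE's full generator-training loop.

```python
# Two-point zeroth-order gradient estimation over random directions:
# only loss values are needed, so no white-box access to client models.
import numpy as np

def zo_gradient(loss_fn, x, n_dirs=32, mu=1e-3):
    g = np.zeros_like(x)
    for _ in range(n_dirs):
        u = np.random.randn(*x.shape)   # random probe direction
        g += (loss_fn(x + mu * u) - loss_fn(x - mu * u)) / (2 * mu) * u
    return g / n_dirs

loss = lambda x: float((x ** 2).sum())  # black-box stand-in; true grad is 2x
x = np.ones(5)
print(zo_gradient(loss, x))             # entries cluster around 2.0
```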
arXiv:2503.06021 (https://arxiv.org/abs/2503.06021) [pdf, other] (cs.LG)
FedEM: A Privacy-Preserving Framework for Concurrent Utility Preservation in Federated Learning
Authors: Mingcong Xu, Xiaojin Zhang, Wei Chen, Hai Jin
Abstract: Federated Learning (FL) enables collaborative training of models across distributed clients without sharing local data, addressing privacy concerns in decentralized systems. However, the gradient-sharing process exposes private data to potential leakage, compromising FL's privacy guarantees in real-world applications. To address this issue, we propose Federated Error Minimization (FedEM), a novel algorithm that incorporates controlled perturbations through adaptive noise injection. This mechanism effectively mitigates gradient leakage attacks while maintaining model performance. Experimental results on benchmark datasets demonstrate that FedEM significantly reduces privacy risks and preserves model accuracy, achieving a robust balance between privacy protection and utility preservation.
Submitted 7 March, 2025; originally announced March 2025.
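The abstract does not spell out the adaptive rule, so the following is only a generic sketch of perturbing gradients before sharing, with the noise scale tied to the gradient norm as an assumption; it is not FedEM's mechanism.

```python
# Generic gradient perturbation before sharing (illustrative assumption:
# noise scaled relative to each gradient's norm so utility degrades
# gracefully as the privacy knob increases).
import torch

def perturb_gradients(model, noise_scale=0.01):
    with torch.no_grad():
        for p in model.parameters():
            if p.grad is not None:
                sigma = noise_scale * p.grad.norm() / (p.grad.numel() ** 0.5)
                p.grad.add_(torch.randn_like(p.grad) * sigma)

model = torch.nn.Linear(10, 2)
model(torch.randn(4, 10)).sum().backward()
perturb_gradients(model)            # a client would now share the noisy grads
print(model.weight.grad.shape)      # torch.Size([2, 10])
```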
arXiv:2503.05727 (https://arxiv.org/abs/2503.05727) [pdf, other] (cs.CY, cs.CR)
Toward Integrated Solutions: A Systematic Interdisciplinary Review of Cybergrooming Research
Authors: Heajun An, Marcos Silva, Qi Zhang, Arav Singh, Minqian Liu, Xinyi Zhang, Sarvech Qadir, Sang Won Lee, Lifu Huang, Pamela Wisnieswski, Jin-Hee Cho
Abstract: Cybergrooming exploits minors through online trust-building, yet research remains fragmented, limiting holistic prevention. Social sciences focus on behavioral insights, while computational methods emphasize detection, but their integration remains insufficient. This review systematically synthesizes both fields using the PRISMA framework to enhance clarity, reproducibility, and cross-disciplinary collaboration. Findings show that qualitative methods offer deep insights but are resource-intensive, machine learning models depend on data quality, and standard metrics struggle with imbalance and cultural nuances. By bridging these gaps, this review advances interdisciplinary cybergrooming research, guiding future efforts toward more effective prevention and detection strategies.
Submitted 17 February, 2025; originally announced March 2025.
arXiv:2503.05713 (https://arxiv.org/abs/2503.05713) [pdf, other] (cs.CY, cs.CL)
Beyond English: Unveiling Multilingual Bias in LLM Copyright Compliance
Authors: Yupeng Chen, Xiaoyu Zhang, Yixian Huang, Qian Xie
Abstract: Large Language Models (LLMs) have raised significant concerns regarding the fair use of copyright-protected content. While prior studies have examined the extent to which LLMs reproduce copyrighted materials, they have predominantly focused on English, neglecting the multilingual dimensions of copyright protection. In this work, we investigate multilingual biases in LLM copyright protection by addressing two key questions: (1) Do LLMs exhibit bias in protecting copyrighted works across languages? (2) Is it easier to elicit copyrighted content using prompts in specific languages? To explore these questions, we construct a dataset of popular song lyrics in English, French, Chinese, and Korean and systematically probe seven LLMs using prompts in these languages. Our findings reveal significant imbalances in LLMs' handling of copyrighted content, both in terms of the language of the copyrighted material and the language of the prompt. These results highlight the need for further research and the development of more robust, language-agnostic copyright protection mechanisms that ensure fair and consistent protection across languages.
Submitted 14 February, 2025; originally announced March 2025.
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Work in progress</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.05689">arXiv:2503.05689</a> <span> [<a href="https://arxiv.org/pdf/2503.05689">pdf</a>, <a href="https://arxiv.org/format/2503.05689">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> GoalFlow: Goal-Driven Flow Matching for Multimodal Trajectories Generation in End-to-End Autonomous Driving </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xing%2C+Z">Zebin Xing</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xingyu Zhang</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+Y">Yang Hu</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+B">Bo Jiang</a>, <a href="/search/cs?searchtype=author&query=He%2C+T">Tong He</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Q">Qian Zhang</a>, <a href="/search/cs?searchtype=author&query=Long%2C+X">Xiaoxiao Long</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+W">Wei Yin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.05689v2-abstract-short" style="display: inline;"> We propose GoalFlow, an end-to-end autonomous driving method for generating high-quality multimodal trajectories. In autonomous driving scenarios, there is rarely a single suitable trajectory. Recent methods have increasingly focused on modeling multimodal trajectory distributions. However, they suffer from trajectory selection complexity and reduced trajectory quality due to high trajectory diver… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.05689v2-abstract-full').style.display = 'inline'; document.getElementById('2503.05689v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.05689v2-abstract-full" style="display: none;"> We propose GoalFlow, an end-to-end autonomous driving method for generating high-quality multimodal trajectories. In autonomous driving scenarios, there is rarely a single suitable trajectory. Recent methods have increasingly focused on modeling multimodal trajectory distributions. However, they suffer from trajectory selection complexity and reduced trajectory quality due to high trajectory divergence and inconsistencies between guidance and scene information. To address these issues, we introduce GoalFlow, a novel method that effectively constrains the generative process to produce high-quality, multimodal trajectories. To resolve the trajectory divergence problem inherent in diffusion-based methods, GoalFlow constrains the generated trajectories by introducing a goal point. GoalFlow establishes a novel scoring mechanism that selects the most appropriate goal point from the candidate points based on scene information. Furthermore, GoalFlow employs an efficient generative method, Flow Matching, to generate multimodal trajectories, and incorporates a refined scoring mechanism to select the optimal trajectory from the candidates. 
Our experimental results, validated on the Navsim benchmark \cite{Dauner2024_navsim}, demonstrate that GoalFlow achieves state-of-the-art performance, delivering robust multimodal trajectories for autonomous driving. GoalFlow achieves a PDMS of 90.3, significantly surpassing other methods. Compared with other diffusion-policy-based methods, our approach requires only a single denoising step to obtain excellent performance. The code is available at https://github.com/YvanYin/GoalFlow. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 7 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li>
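<p class="is-size-7">Flow Matching, the generative backbone named in this abstract, trains a network to predict the velocity that transports a noise sample to a data sample along a straight-line path. The training step below is a generic conditional flow-matching sketch in PyTorch; the model signature, the goal-point conditioning, and the trajectory shapes are assumptions, not the GoalFlow implementation.</p>
<pre><code>
import torch

def flow_matching_step(model, optimizer, traj, goal):
    """One generic flow-matching update.
    traj: (B, T, 2) ground-truth trajectories; goal: (B, 2) goal points."""
    x1 = traj
    x0 = torch.randn_like(x1)                 # noise endpoint of the path
    t = torch.rand(x1.shape[0], 1, 1)         # random time in [0, 1]
    x_t = (1 - t) * x0 + t * x1               # point on the straight-line path
    target_v = x1 - x0                        # constant velocity along the path
    pred_v = model(x_t, t, goal)              # assumed model signature
    loss = ((pred_v - target_v) ** 2).mean()
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss.item()

# At inference, a single Euler step x0 + model(x0, 0, goal) already yields a
# trajectory, which is consistent with the one-denoising-step claim above.
</code></pre>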
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.05373">arXiv:2503.05373</a> <span> [<a href="https://arxiv.org/pdf/2503.05373">pdf</a>, <a href="https://arxiv.org/format/2503.05373">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Leveraging Semantic Type Dependencies for Clinical Named Entity Recognition </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Le%2C+L">Linh Le</a>, <a href="/search/cs?searchtype=author&query=Zuccon%2C+G">Guido Zuccon</a>, <a href="/search/cs?searchtype=author&query=Demartini%2C+G">Gianluca Demartini</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+G">Genghong Zhao</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xia Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2503.05373v1-abstract-full"> Previous work on clinical relation extraction from free-text sentences leveraged information about semantic types from clinical knowledge bases as a part of entity representations. In this paper, we exploit additional evidence by also making use of domain-specific semantic type dependencies. We encode the relation between a span of tokens matching a Unified Medical Language System (UMLS) concept and other tokens in the sentence. We implement our method and compare against different named entity recognition (NER) architectures (i.e., BiLSTM-CRF and BiLSTM-GCN-CRF) using different pre-trained clinical embeddings (i.e., BERT, BioBERT, UMLSBert). Our experimental results on clinical datasets show that in some cases NER effectiveness can be significantly improved by making use of domain-specific semantic type dependencies. Our work is also the first study to generate a matrix encoding that makes use of more than three dependencies in one pass for the NER task. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> AMIA - American Medical Informatics Association 2022 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.05204">arXiv:2503.05204</a> <span> [<a href="https://arxiv.org/pdf/2503.05204">pdf</a>, <a href="https://arxiv.org/format/2503.05204">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Data-Efficient Generalization for Zero-shot Composed Image Retrieval </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+Z">Zining Chen</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+Z">Zhicheng Zhao</a>, <a href="/search/cs?searchtype=author&query=Su%2C+F">Fei Su</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xiaoqin Zhang</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+S">Shijian Lu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2503.05204v1-abstract-full"> Zero-shot Composed Image Retrieval (ZS-CIR) aims to retrieve the target image based on a reference image and a text description without requiring in-distribution triplets for training. One prevalent approach follows the vision-language pretraining paradigm that employs a mapping network to transfer the image embedding to a pseudo-word token in the text embedding space.
However, this approach tends to impede network generalization due to modality discrepancy and distribution shift between training and inference. To this end, we propose a Data-efficient Generalization (DeG) framework, including two novel designs, namely, the Textual Supplement (TS) module and the Semantic-Set (S-Set). The TS module exploits compositional textual semantics during training, enhancing the pseudo-word token with more linguistic semantics and thus mitigating the modality discrepancy effectively. The S-Set exploits the zero-shot capability of pretrained Vision-Language Models (VLMs), alleviating the distribution shift and mitigating the overfitting issue from the redundancy of the large-scale image-text data. Extensive experiments over four ZS-CIR benchmarks show that DeG outperforms the state-of-the-art (SOTA) methods with much less training data, and saves substantial training and inference time for practical usage. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.05139">arXiv:2503.05139</a> <span> [<a href="https://arxiv.org/pdf/2503.05139">pdf</a>, <a href="https://arxiv.org/format/2503.05139">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Every FLOP Counts: Scaling a 300B Mixture-of-Experts LING LLM without Premium GPUs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ling+Team">Ling Team</a>, <a href="/search/cs?searchtype=author&query=Zeng%2C+B">Binwei Zeng</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+C">Chao Huang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+C">Chao Zhang</a>, <a href="/search/cs?searchtype=author&query=Tian%2C+C">Changxin Tian</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+C">Cong Chen</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+D">Dingnan Jin</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+F">Feng Yu</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+F">Feng Zhu</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+F">Feng Yuan</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+F">Fakang Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+G">Gangshan Wang</a>, <a href="/search/cs?searchtype=author&query=Zhai%2C+G">Guangyao Zhai</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+H">Haitao Zhang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+H">Huizhong Li</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+J">Jun Zhou</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+J">Jia Liu</a>, <a href="/search/cs?searchtype=author&query=Fang%2C+J">Junpeng Fang</a>,
<a href="/search/cs?searchtype=author&query=Ou%2C+J">Junjie Ou</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+J">Jun Hu</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+J">Ji Luo</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Ji Zhang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+J">Jian Liu</a>, <a href="/search/cs?searchtype=author&query=Sha%2C+J">Jian Sha</a>, <a href="/search/cs?searchtype=author&query=Qian%2C+J">Jianxue Qian</a>, et al. (49 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2503.05139v2-abstract-full"> In this technical report, we tackle the challenges of training large-scale Mixture of Experts (MoE) models, focusing on overcoming cost inefficiency and resource limitations prevalent in such systems. To address these issues, we present two differently sized MoE large language models (LLMs), namely Ling-Lite and Ling-Plus (referred to as "Bailing" in Chinese, spelled Bǎilíng in Pinyin). Ling-Lite contains 16.8 billion parameters with 2.75 billion activated parameters, while Ling-Plus boasts 290 billion parameters with 28.8 billion activated parameters. Both models exhibit comparable performance to leading industry benchmarks. This report offers actionable insights to improve the efficiency and accessibility of AI development in resource-constrained settings, promoting more scalable and sustainable technologies. Specifically, to reduce training costs for large-scale MoE models, we propose innovative methods for (1) optimization of model architecture and training processes, (2) refinement of training anomaly handling, and (3) enhancement of model evaluation efficiency. Additionally, leveraging high-quality data generated from knowledge graphs, our models demonstrate superior capabilities in tool use compared to other models. Ultimately, our experimental findings demonstrate that a 300B MoE LLM can be effectively trained on lower-performance devices while achieving comparable performance to models of a similar scale, including dense and MoE models. Compared to high-performance devices, utilizing a lower-specification hardware system during the pre-training phase demonstrates significant cost savings, reducing computing costs by approximately 20%. The models can be accessed at https://huggingface.co/inclusionAI.
</span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 6 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">34 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.05057">arXiv:2503.05057</a> <span> [<a href="https://arxiv.org/pdf/2503.05057">pdf</a>, <a href="https://arxiv.org/format/2503.05057">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> Prismatic-Bending Transformable (PBT) Joint for a Modular, Foldable Manipulator with Enhanced Reachability and Dexterity </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhou%2C+J">Jianshu Zhou</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+J">Junda Huang</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+B">Boyuan Liang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xiang Zhang</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+X">Xin Ma</a>, <a href="/search/cs?searchtype=author&query=Tomizuka%2C+M">Masayoshi Tomizuka</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2503.05057v1-abstract-full"> Robotic manipulators, traditionally designed with classical joint-link articulated structures, excel in industrial applications but face challenges in human-centered and general-purpose tasks requiring greater dexterity and adaptability. Addressing these limitations, we introduce the Prismatic-Bending Transformable (PBT) Joint, a novel design inspired by the scissors mechanism, enabling transformable kinematic chains. Each PBT joint module provides three degrees of freedom -- bending, rotation, and elongation/contraction -- allowing scalable and reconfigurable assemblies to form diverse kinematic configurations tailored to specific tasks.
This innovative design surpasses conventional systems, delivering superior flexibility and performance across various applications. We present the design, modeling, and experimental validation of the PBT joint, demonstrating its integration into modular and foldable robotic arms. The PBT joint functions as a single SKU, enabling manipulators to be constructed entirely from standardized PBT joints without additional customized components. It also serves as a modular extension for existing systems, such as wrist modules, streamlining design, deployment, transportation, and maintenance. Three sizes -- large, medium, and small -- have been developed and integrated into robotic manipulators, highlighting their enhanced dexterity, reachability, and adaptability for manipulation tasks. This work represents a significant advancement in robotic design, offering scalable and efficient solutions for dynamic and unstructured environments. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.04872">arXiv:2503.04872</a> <span> [<a href="https://arxiv.org/pdf/2503.04872">pdf</a>, <a href="https://arxiv.org/format/2503.04872">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> TinyR1-32B-Preview: Boosting Accuracy with Branch-Merge Distillation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Sun%2C+L">Lin Sun</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+G">Guangxiang Zhao</a>, <a href="/search/cs?searchtype=author&query=Jian%2C+X">Xiaoqi Jian</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+Y">Yuhan Wu</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+W">Weihong Lin</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+Y">Yongfu Zhu</a>, <a href="/search/cs?searchtype=author&query=Jia%2C+C">Change Jia</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+L">Linglin Zhang</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+J">Jinzhu Wu</a>, <a href="/search/cs?searchtype=author&query=Ran%2C+J">Junfeng Ran</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+S">Sai-er Hu</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+Z">Zihan Jiang</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+J">Junting Zhou</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+W">Wenrui Liu</a>, <a href="/search/cs?searchtype=author&query=Cui%2C+B">Bin Cui</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+T">Tong Yang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xiangzheng Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2503.04872v1-abstract-full"> The
challenge of reducing the size of Large Language Models (LLMs) while maintaining their performance has gained significant attention. However, existing methods, such as model distillation and transfer learning, often fail to achieve high accuracy. To address this limitation, we introduce the Branch-Merge distillation approach, which enhances model compression through two phases: (1) the Branch Phase, where knowledge from a large teacher model is selectively distilled into specialized student models via domain-specific supervised fine-tuning (SFT); and (2) the Merge Phase, where these student models are merged to enable cross-domain knowledge transfer and improve generalization. We validate our distillation approach using DeepSeek-R1 as the teacher and DeepSeek-R1-Distill-Qwen-32B as the student. The resulting merged model, TinyR1-32B-Preview, outperforms its counterpart DeepSeek-R1-Distill-Qwen-32B across multiple benchmarks, including Mathematics (+5.5 points), Coding (+4.4 points) and Science (+2.9 points), while achieving near-equal performance to DeepSeek-R1 on AIME 2024. The Branch-Merge distillation approach provides a scalable solution for creating smaller, high-performing LLMs with reduced computational cost and time. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p>
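<p class="is-size-7">The Merge Phase described above combines domain-specialized student checkpoints into one model. A common way to realize such a merge is weighted parameter averaging; the sketch below assumes uniform weights and PyTorch state dicts, and is not the authors' actual merging rule.</p>
<pre><code>
import torch

def merge_state_dicts(state_dicts, weights=None):
    """Weighted average of compatible model checkpoints (assumed merge rule)."""
    if weights is None:
        weights = [1.0 / len(state_dicts)] * len(state_dicts)
    return {key: sum(w * sd[key].float() for w, sd in zip(weights, state_dicts))
            for key in state_dicts[0]}

# Hypothetical usage with three domain-specific SFT students:
# students = [torch.load(p, map_location="cpu")
#             for p in ("math_sft.pt", "code_sft.pt", "science_sft.pt")]
# model.load_state_dict(merge_state_dicts(students))
</code></pre>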
<p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Preprint</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.04793">arXiv:2503.04793</a> <span> [<a href="https://arxiv.org/pdf/2503.04793">pdf</a>, <a href="https://arxiv.org/format/2503.04793">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Sentence-level Reward Model can Generalize Better for Aligning LLM from Human Preference </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Qiu%2C+W">Wenjie Qiu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yi-Chen Li</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xuqin Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+T">Tianyi Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yihang Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Z">Zongzhang Zhang</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+Y">Yang Yu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2503.04793v1-abstract-full"> Learning reward models from human preference datasets and subsequently optimizing language models via reinforcement learning has emerged as a fundamental paradigm for aligning LLMs with human preferences. The performance of the reward model plays a crucial role in the effectiveness of alignment. Previous reward models operate at a coarse-grained level, requiring the generation of a complete response to obtain a reward value. The sparse reward may present challenges for downstream reinforcement learning. While recent efforts have attempted to learn token-level reward models, the lack of explicit semantic information makes it difficult to model the credit of every individual token. In this paper, we propose assigning scores to every sentence, introducing an intermediate-grained reward model. By segmenting the complete response into sentences and applying differential operations to the reward output at the start and end positions of each sentence, we can effectively model the rewards of sentences.
Moreover, a novel attention mechanism is introduced to aggregate the scores of all sentences into a response-level score, which allows it to be trained using the Bradley-Terry model. On common benchmarks, our method outperforms the response-level reward model by 2.7% on RewardBench (for reward modeling evaluation) and surpasses all baselines on AlpacaEval (for alignment evaluation). </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li>
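<p class="is-size-7">The differential operation described in this abstract reads naturally as a subtraction of the reward head's output at a sentence's end and start positions, with the resulting sentence scores pooled into a response score for a Bradley-Terry objective. The sketch below is an interpretation under those assumptions; uniform mean pooling stands in for the paper's learned attention, and none of this is the authors' code.</p>
<pre><code>
import torch
import torch.nn.functional as F

def sentence_scores(token_rewards, sentence_spans):
    """token_rewards: (L,) per-token reward-head outputs;
    sentence_spans: list of (start, end) token indices per sentence."""
    return torch.stack([token_rewards[end] - token_rewards[start]
                        for start, end in sentence_spans])

def bradley_terry_loss(chosen_score, rejected_score):
    # Maximize the margin between chosen and rejected response scores.
    return -F.logsigmoid(chosen_score - rejected_score).mean()

# Dummy usage: two responses of 12 tokens, three sentences each.
r_chosen, r_rejected = torch.randn(12), torch.randn(12)
spans = [(0, 4), (4, 9), (9, 11)]
loss = bradley_terry_loss(sentence_scores(r_chosen, spans).mean(),
                          sentence_scores(r_rejected, spans).mean())
</code></pre>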
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.04715">arXiv:2503.04715</a> <span> [<a href="https://arxiv.org/pdf/2503.04715">pdf</a>, <a href="https://arxiv.org/format/2503.04715">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Predictable Scale: Part I -- Optimal Hyperparameter Scaling Law in Large Language Model Pretraining </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+H">Houyi Li</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+W">Wenzheng Zheng</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+J">Jingcheng Hu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Q">Qiufeng Wang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+H">Hanshan Zhang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zili Wang</a>, <a href="/search/cs?searchtype=author&query=Xuyang%2C+S">Shijie Xuyang</a>, <a href="/search/cs?searchtype=author&query=Fan%2C+Y">Yuantao Fan</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+S">Shuigeng Zhou</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xiangyu Zhang</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+D">Daxin Jiang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2503.04715v2-abstract-full"> The impressive capabilities of Large Language Models (LLMs) across diverse tasks are now well-established, yet their effective deployment necessitates careful hyperparameter optimization. Through extensive empirical studies involving grid searches across diverse configurations, we discover universal scaling laws governing these hyperparameters: the optimal learning rate follows a power-law relationship with both model parameters and data sizes, while the optimal batch size scales primarily with data sizes. Our analysis reveals a convex optimization landscape for hyperparameters under fixed model and data size conditions. This convexity implies an optimal hyperparameter plateau. We contribute a universal, plug-and-play optimal hyperparameter tool for the community. Its estimated values on the test set are merely 0.09% away from the globally optimal LLM performance found via an exhaustive search. These laws demonstrate remarkable robustness across variations in model sparsity, training data distribution, and model shape. To the best of our knowledge, this is the first work that unifies different model shapes and structures, such as Mixture-of-Experts models and dense transformers, and establishes optimal hyperparameter scaling laws across diverse data distributions. This exhaustive optimization process demands substantial computational resources, utilizing nearly one million NVIDIA H800 GPU hours to train 3,700 LLMs of varying sizes and hyperparameters from scratch and consuming approximately 100 trillion tokens in total. To facilitate reproducibility and further research, we will progressively release all loss measurements and model checkpoints through our designated repository https://step-law.github.io/ </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 6 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p>
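<p class="is-size-7">The functional form claimed in this abstract is a power law in model size N (parameters) and data size D (tokens) for the learning rate, and in D alone for the batch size. A minimal transcription follows; the coefficients and exponents are placeholders, not the fitted values, which the authors release at https://step-law.github.io/.</p>
<pre><code>
# Placeholder power-law forms; c, alpha, beta, gamma are NOT the paper's
# fitted constants, only illustrative defaults.
def optimal_lr(N: float, D: float, c: float = 1.0,
               alpha: float = -0.2, beta: float = 0.1) -> float:
    """Optimal learning rate as a power law in parameters N and tokens D."""
    return c * (N ** alpha) * (D ** beta)

def optimal_batch_size(D: float, c: float = 1.0, gamma: float = 0.3) -> float:
    """Optimal batch size scaling primarily with data size D."""
    return c * (D ** gamma)
</code></pre>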
<p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">19 pages</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> F.2.2; I.2.7 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.04619">arXiv:2503.04619</a> <span> [<a href="https://arxiv.org/pdf/2503.04619">pdf</a>, <a href="https://arxiv.org/format/2503.04619">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> SynGraph: A Dynamic Graph-LLM Synthesis Framework for Sparse Streaming User Sentiment Modeling </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xin Zhang</a>, <a href="/search/cs?searchtype=author&query=Wei%2C+Q">Qiyu Wei</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+Y">Yingjie Zhu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+L">Linhai Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+D">Deyu Zhou</a>, <a href="/search/cs?searchtype=author&query=Ananiadou%2C+S">Sophia Ananiadou</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2503.04619v1-abstract-full"> User reviews on e-commerce platforms exhibit dynamic sentiment patterns driven by temporal and contextual factors. Traditional sentiment analysis methods focus on static reviews, failing to capture the evolving temporal relationship between user sentiment rating and textual content. Sentiment analysis on streaming reviews addresses this limitation by modeling and predicting the temporal evolution of user sentiments. However, it suffers from data sparsity, manifesting in temporal, spatial, and combined forms. In this paper, we introduce SynGraph, a novel framework designed to address data sparsity in sentiment analysis on streaming reviews. SynGraph alleviates data sparsity by categorizing users into mid-tail, long-tail, and extreme scenarios and incorporating LLM-augmented enhancements within a dynamic graph-based structure. Experiments on real-world datasets demonstrate its effectiveness in addressing sparsity and improving sentiment modeling in streaming reviews.
</span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">18 pages, 17 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.04564">arXiv:2503.04564</a> <span> [<a href="https://arxiv.org/pdf/2503.04564">pdf</a>, <a href="https://arxiv.org/format/2503.04564">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> </div> </div> <p class="title is-5 mathjax"> Fundamental Limits of Hierarchical Secure Aggregation with Cyclic User Association </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xiang Zhang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zhou Li</a>, <a href="/search/cs?searchtype=author&query=Wan%2C+K">Kai Wan</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+H">Hua Sun</a>, <a href="/search/cs?searchtype=author&query=Ji%2C+M">Mingyue Ji</a>, <a href="/search/cs?searchtype=author&query=Caire%2C+G">Giuseppe Caire</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2503.04564v2-abstract-full"> Secure aggregation is motivated by federated learning (FL) where a cloud server aims to compute an averaged model (i.e., weights of deep neural networks) of the locally-trained models of numerous clients, while adhering to data security requirements. Hierarchical secure aggregation (HSA) extends this concept to a three-layer network, where clustered users communicate with the server through an intermediate layer of relays.
In HSA, beyond conventional server security, relay security is also enforced to ensure that the relays remain oblivious to the users' inputs (an abstraction of the local models in FL). Existing studies on HSA assume that each user is associated with only one relay, limiting opportunities for coding across inter-cluster users to achieve efficient communication and key generation. In this paper, we consider HSA with a cyclic association pattern where each user is connected to $B$ consecutive relays in a wrap-around manner. We propose an efficient aggregation scheme which includes a message design for the inputs inspired by gradient coding -- a well-known technique for efficient communication in distributed computing -- along with a highly nontrivial security key design. We also derive novel converse bounds on the minimum achievable communication and key rates using information-theoretic arguments. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 6 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li>
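<p class="is-size-7">The cyclic association pattern in this abstract has a direct transcription: with R relays indexed mod R, the users of cluster i connect to relays i through i+B-1, wrapping around. The helper below is just that mapping; the one-cluster-per-relay indexing is an assumption for illustration.</p>
<pre><code>
def cyclic_association(num_relays: int, B: int) -> dict:
    """Map each user cluster i to its B consecutive relays (wrap-around)."""
    return {i: [(i + j) % num_relays for j in range(B)]
            for i in range(num_relays)}

# cyclic_association(5, 2) ->
# {0: [0, 1], 1: [1, 2], 2: [2, 3], 3: [3, 4], 4: [4, 0]}
</code></pre>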
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.04550">arXiv:2503.04550</a> <span> [<a href="https://arxiv.org/pdf/2503.04550">pdf</a>, <a href="https://arxiv.org/format/2503.04550">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Benchmarking Reasoning Robustness in Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yu%2C+T">Tong Yu</a>, <a href="/search/cs?searchtype=author&query=Jing%2C+Y">Yongcheng Jing</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xikun Zhang</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+W">Wentao Jiang</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+W">Wenjie Wu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yingjie Wang</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+W">Wenbin Hu</a>, <a href="/search/cs?searchtype=author&query=Du%2C+B">Bo Du</a>, <a href="/search/cs?searchtype=author&query=Tao%2C+D">Dacheng Tao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2503.04550v1-abstract-full"> Despite the recent success of large language models (LLMs) in reasoning such as DeepSeek, we for the first time identify a key dilemma in reasoning robustness and generalization: significant performance degradation on novel or incomplete data, suggesting a reliance on memorized patterns rather than systematic reasoning. Our closer examination reveals four key limitations underlying this issue: (1) Positional bias -- models favor earlier queries in multi-query inputs but answer later ones incorrectly (e.g., GPT-4o's accuracy drops from 75.8 percent to 72.8 percent); (2) Instruction sensitivity -- performance declines by 5.0 to 7.5 percent in the Qwen2.5 series and by 5.0 percent in DeepSeek-V3 with auxiliary guidance; (3) Numerical fragility -- value substitution sharply reduces accuracy (e.g., GPT-4o drops from 97.5 percent to 82.5 percent, GPT-o1-mini drops from 97.5 percent to 92.5 percent); and (4) Memory dependence -- models resort to guesswork when missing critical data. These findings further highlight the reliance on heuristic recall over rigorous logical inference, demonstrating challenges in reasoning robustness. To comprehensively investigate these robustness challenges, this paper introduces a novel benchmark, termed Math-RoB, that exploits hallucinations triggered by missing information to expose reasoning gaps. This is achieved by an instruction-based approach to generate diverse datasets that closely resemble training distributions, facilitating a holistic robustness assessment and advancing the development of more robust reasoning frameworks. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025.
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.04517">arXiv:2503.04517</a> <span> [<a href="https://arxiv.org/pdf/2503.04517">pdf</a>, <a href="https://arxiv.org/format/2503.04517">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Quantum Physics">quant-ph</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computational Complexity">cs.CC</span> </div> </div> <p class="title is-5 mathjax"> Succinct Perfect Zero-knowledge for MIP* </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Fu%2C+H">Honghao Fu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xingjian Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2503.04517v1-abstract-full"> In the recent breakthrough result of Slofstra and Mastel (STOC'24), they show that there is a two-player one-round perfect zero-knowledge MIP* protocol for RE. We build on their result to show that there exists a succinct two-player one-round perfect zero-knowledge MIP* protocol for RE with polylog question size and O(1) answer size, or with O(1) question size and polylog answer size. To prove our result, we analyze the four central compression techniques underlying the MIP* = RE proof (Ji et al. '20) -- question reduction, oracularization, answer reduction, and parallel repetition -- and show that they all preserve the perfect (as well as statistical and computational) zero-knowledge properties of the original protocol. Furthermore, we complete the study of the conversion between constraint-constraint and constraint-variable binary constraint system (BCS) nonlocal games, which provide a quantum information characterization of MIP* protocols. While Paddock (QIP'23) established that any near-perfect strategy for a constraint-variable game can be mapped to a constraint-constraint version, we prove the converse, fully establishing their equivalence. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025.
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.04473">arXiv:2503.04473</a> <span> [<a href="https://arxiv.org/pdf/2503.04473">pdf</a>, <a href="https://arxiv.org/format/2503.04473">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> </div> </div> <p class="title is-5 mathjax"> Runtime Backdoor Detection for Federated Learning via Representational Dissimilarity Analysis </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xiyue Zhang</a>, <a href="/search/cs?searchtype=author&query=Xue%2C+X">Xiaoyong Xue</a>, <a href="/search/cs?searchtype=author&query=Du%2C+X">Xiaoning Du</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+X">Xiaofei Xie</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yang Liu</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+M">Meng Sun</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2503.04473v1-abstract-full"> Federated learning (FL), as a powerful learning paradigm, trains a shared model by aggregating model updates from distributed clients. However, the decoupling of model learning from local data makes FL highly vulnerable to backdoor attacks, where a single compromised client can poison the shared model. While recent progress has been made in backdoor detection, existing methods face challenges with detection accuracy and runtime effectiveness, particularly when dealing with complex model architectures. In this work, we propose a novel approach to detecting malicious clients in an accurate, stable, and efficient manner. Our method utilizes a sampling-based network representation method to quantify dissimilarities between clients, identifying model deviations caused by backdoor injections. We also propose an iterative algorithm to progressively detect and exclude malicious clients as outliers based on these dissimilarity measurements. Evaluations across a range of benchmark tasks demonstrate that our approach outperforms state-of-the-art methods in detection accuracy and defense effectiveness. When deployed for runtime protection, our approach effectively eliminates backdoor injections with marginal overheads.
</span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to IEEE Transactions on Dependable and Secure Computing (TDSC)</span> </p> </li>
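<p class="is-size-7">The iterative exclusion step in the abstract above can be pictured as repeated robust outlier removal over per-client representation vectors. The sketch below uses a median/MAD distance rule as the outlier criterion; both the representation vectors and that rule are assumptions for illustration, not the paper's detector.</p>
<pre><code>
import numpy as np

def detect_outlier_clients(reps: np.ndarray, threshold: float = 2.0,
                           max_rounds: int = 10) -> list:
    """reps: (n_clients, d), one representation vector per client.
    Iteratively flags clients far from the robust center of the active set."""
    active = list(range(len(reps)))
    for _ in range(max_rounds):
        center = np.median(reps[active], axis=0)
        dists = np.linalg.norm(reps[active] - center, axis=1)
        med = np.median(dists)
        mad = np.median(np.abs(dists - med)) + 1e-12   # robust scale estimate
        flagged = [c for c, d in zip(active, dists) if (d - med) / mad > threshold]
        if not flagged:
            break
        active = [c for c in active if c not in flagged]
    return sorted(set(range(len(reps))) - set(active))
</code></pre>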
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.04452">arXiv:2503.04452</a> <span> [<a href="https://arxiv.org/pdf/2503.04452">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> A lightweight model FDM-YOLO for small target improvement based on YOLOv8 </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xuerui Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2503.04452v1-abstract-full"> Small targets are particularly difficult to detect due to their low pixel count, complex backgrounds, and varying shooting angles, which make it hard for models to extract effective features. While some large-scale models offer high accuracy, their long inference times make them unsuitable for real-time deployment on edge devices. On the other hand, models designed for low computational power often suffer from poor detection accuracy. This paper focuses on small target detection and explores methods for object detection under low computational constraints. Building on the YOLOv8 model, we propose a new network architecture called FDM-YOLO. Our research includes the following key contributions: We introduce FDM-YOLO by analyzing the output of the YOLOv8 detection head. We add a high-resolution layer and remove the large target detection layer to better handle small targets. Based on PConv, we propose a lightweight network structure called Fast-C2f, which is integrated into the PAN module of the model. To mitigate the accuracy loss caused by model lightweighting, we employ dynamic upsampling (Dysample) and a lightweight EMA attention mechanism. The FDM-YOLO model was validated on the VisDrone dataset, achieving a 38% reduction in parameter count and improving the mAP@0.5 score from 38.4% to 42.5%, all while maintaining nearly the same inference speed. This demonstrates the effectiveness of our approach in balancing accuracy and efficiency for edge device deployment. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.04240">arXiv:2503.04240</a> <span> [<a href="https://arxiv.org/pdf/2503.04240">pdf</a>, <a href="https://arxiv.org/format/2503.04240">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> DiffPO: Diffusion-styled Preference Optimization for Efficient Inference-Time Alignment of Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+R">Ruizhe Chen</a>, <a href="/search/cs?searchtype=author&query=Chai%2C+W">Wenhao Chai</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Z">Zhifei Yang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xiaotian Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+J+T">Joey Tianyi Zhou</a>, <a href="/search/cs?searchtype=author&query=Quek%2C+T">Tony Quek</a>, <a href="/search/cs?searchtype=author&query=Poria%2C+S">Soujanya Poria</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Z">Zuozhu Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2503.04240v2-abstract-full"> Inference-time alignment provides an efficient alternative for aligning LLMs with humans. However, these approaches still face challenges, such as limited scalability due to policy-specific value functions and latency during the inference phase.

arXiv:2503.04240 (https://arxiv.org/abs/2503.04240) [pdf, other] cs.CL
DiffPO: Diffusion-styled Preference Optimization for Efficient Inference-Time Alignment of Large Language Models
Authors: Ruizhe Chen, Wenhao Chai, Zhifei Yang, Xiaotian Zhang, Joey Tianyi Zhou, Tony Quek, Soujanya Poria, Zuozhu Liu
Abstract: Inference-time alignment provides an efficient alternative for aligning LLMs with humans. However, these approaches still face challenges such as limited scalability due to policy-specific value functions and latency during the inference phase. In this paper, we propose Diffusion-styled Preference Optimization (DiffPO), an efficient and policy-agnostic solution for aligning LLMs with humans. By performing alignment directly at the sentence level, DiffPO avoids the time latency associated with token-level generation. Designed as a plug-and-play module, DiffPO can be seamlessly integrated with various base models to enhance their alignment. Extensive experiments on AlpacaEval 2, MT-bench, and HH-RLHF demonstrate that DiffPO achieves superior alignment performance across various settings, with a favorable trade-off between alignment quality and inference-time latency. Furthermore, DiffPO demonstrates model-agnostic scalability, significantly improving the performance of large models such as Llama-3-70B.
Submitted 9 March, 2025; v1 submitted 6 March, 2025; originally announced March 2025.

arXiv:2503.04162 (https://arxiv.org/abs/2503.04162) [pdf, other] cs.IR, cs.AI
Semantic Retrieval Augmented Contrastive Learning for Sequential Recommendation
Authors: Ziqiang Cui, Yunpeng Weng, Xing Tang, Xiaokun Zhang, Dugang Liu, Shiwei Li, Peiyang Liu, Bowei He, Weihong Luo, Xiuqiang He, Chen Ma
Abstract: Sequential recommendation aims to model user preferences from historical behavior sequences, which is crucial for various online platforms. Data sparsity remains a significant challenge in this area, as most users have limited interactions and many items receive little attention. To mitigate this issue, contrastive learning has been widely adopted: by constructing positive sample pairs from the data itself and maximizing their agreement in the embedding space, it can leverage available data more effectively. Constructing reasonable positive sample pairs is crucial to the success of contrastive learning, yet current approaches struggle to generate reliable pairs: they either rely on representations learned from inherently sparse collaborative signals or use random perturbations that introduce significant uncertainty. To address these limitations, we propose Semantic Retrieval Augmented Contrastive Learning (SRA-CL), which leverages semantic information to improve the reliability of contrastive samples. SRA-CL comprises two main components: (1) Cross-Sequence Contrastive Learning via User Semantic Retrieval, which uses large language models (LLMs) to understand diverse user preferences and retrieves semantically similar users to form reliable positive samples through a learnable sample-synthesis method; and (2) Intra-Sequence Contrastive Learning via Item Semantic Retrieval, which employs LLMs to comprehend items and retrieves similar items for semantic-based item substitution, thereby creating semantically consistent augmented views for contrastive learning. SRA-CL is plug-and-play and can be integrated into standard sequential recommendation models. Extensive experiments on four public datasets demonstrate the effectiveness and generalizability of the proposed approach.
Submitted 6 March, 2025; originally announced March 2025.
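The retrieval idea behind component (1) can be sketched in a few lines. This is a minimal illustration assuming cosine similarity over LLM-derived user embeddings; the paper's learnable sample-synthesis step is omitted and all names here are hypothetical:

```python
import torch
import torch.nn.functional as F

def retrieve_positive_indices(user_emb: torch.Tensor, k: int = 2) -> torch.Tensor:
    """For each user, return the k most semantically similar other users
    (cosine similarity) to serve as contrastive positives. user_emb is
    assumed to come from an LLM encoding of each user's interaction history."""
    normed = F.normalize(user_emb, dim=-1)
    sim = normed @ normed.T
    sim.fill_diagonal_(-float("inf"))   # a user must not be its own positive
    return sim.topk(k, dim=-1).indices

emb = torch.randn(8, 128)                    # 8 users, 128-d semantic embeddings
print(retrieve_positive_indices(emb).shape)  # torch.Size([8, 2])
```

Because the positives come from semantic similarity rather than random perturbation, the pairs avoid the uncertainty the abstract attributes to augmentation-based approaches.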

arXiv:2503.04135 (https://arxiv.org/abs/2503.04135) [pdf, other] cs.CL
Biological Sequence with Language Model Prompting: A Survey
Authors: Jiyue Jiang, Zikang Wang, Yuheng Shan, Heyan Chai, Jiayi Li, Zixian Ma, Xinrui Zhang, Yu Li
Abstract: Large language models (LLMs) have emerged as powerful tools for addressing challenges across diverse domains. Notably, recent studies have demonstrated that LLMs significantly enhance the efficiency of biomolecular analysis and synthesis, attracting widespread attention in academia and medicine. In this paper, we systematically investigate the application of prompt-based methods with LLMs to biological sequences, including DNA, RNA, proteins, and drug-discovery tasks. Specifically, we focus on how prompt engineering enables LLMs to tackle domain-specific problems, such as promoter sequence prediction, protein structure modeling, and drug-target binding affinity prediction, often with limited labeled data. Furthermore, our discussion highlights the transformative potential of prompting in bioinformatics while addressing key challenges such as data scarcity, multimodal fusion, and computational resource limitations. Our aim is for this paper to serve both as a foundational primer for newcomers and as a catalyst for continued innovation within this dynamic field of study.
Submitted 6 March, 2025; originally announced March 2025.
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.04135v1-abstract-full').style.display = 'none'; document.getElementById('2503.04135v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.03803">arXiv:2503.03803</a> <span> [<a href="https://arxiv.org/pdf/2503.03803">pdf</a>, <a href="https://arxiv.org/format/2503.03803">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> EgoLife: Towards Egocentric Life Assistant </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yang%2C+J">Jingkang Yang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+S">Shuai Liu</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+H">Hongming Guo</a>, <a href="/search/cs?searchtype=author&query=Dong%2C+Y">Yuhao Dong</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xiamengwei Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+S">Sicheng Zhang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+P">Pengyun Wang</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+Z">Zitang Zhou</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+B">Binzhu Xie</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Ziyue Wang</a>, <a href="/search/cs?searchtype=author&query=Ouyang%2C+B">Bei Ouyang</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+Z">Zhengyu Lin</a>, <a href="/search/cs?searchtype=author&query=Cominelli%2C+M">Marco Cominelli</a>, <a href="/search/cs?searchtype=author&query=Cai%2C+Z">Zhongang Cai</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yuanhan Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+P">Peiyuan Zhang</a>, <a href="/search/cs?searchtype=author&query=Hong%2C+F">Fangzhou Hong</a>, <a href="/search/cs?searchtype=author&query=Widmer%2C+J">Joerg Widmer</a>, <a href="/search/cs?searchtype=author&query=Gringoli%2C+F">Francesco Gringoli</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+L">Lei Yang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+B">Bo Li</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Z">Ziwei Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.03803v1-abstract-short" style="display: inline;"> We introduce EgoLife, a project to develop an egocentric life assistant that accompanies and enhances personal efficiency through AI-powered wearable glasses. 
To lay the foundation for this assistant, we conducted a comprehensive data collection study where six participants lived together for one week, continuously recording their daily activities - including discussions, shopping, cooking, social… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.03803v1-abstract-full').style.display = 'inline'; document.getElementById('2503.03803v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.03803v1-abstract-full" style="display: none;"> We introduce EgoLife, a project to develop an egocentric life assistant that accompanies and enhances personal efficiency through AI-powered wearable glasses. To lay the foundation for this assistant, we conducted a comprehensive data collection study where six participants lived together for one week, continuously recording their daily activities - including discussions, shopping, cooking, socializing, and entertainment - using AI glasses for multimodal egocentric video capture, along with synchronized third-person-view video references. This effort resulted in the EgoLife Dataset, a comprehensive 300-hour egocentric, interpersonal, multiview, and multimodal daily life dataset with intensive annotation. Leveraging this dataset, we introduce EgoLifeQA, a suite of long-context, life-oriented question-answering tasks designed to provide meaningful assistance in daily life by addressing practical questions such as recalling past relevant events, monitoring health habits, and offering personalized recommendations. To address the key technical challenges of (1) developing robust visual-audio models for egocentric data, (2) enabling identity recognition, and (3) facilitating long-context question answering over extensive temporal information, we introduce EgoButler, an integrated system comprising EgoGPT and EgoRAG. EgoGPT is an omni-modal model trained on egocentric datasets, achieving state-of-the-art performance on egocentric video understanding. EgoRAG is a retrieval-based component that supports answering ultra-long-context questions. Our experimental studies verify their working mechanisms and reveal critical factors and bottlenecks, guiding future improvements. By releasing our datasets, models, and benchmarks, we aim to stimulate further research in egocentric AI assistants. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.03803v1-abstract-full').style.display = 'none'; document.getElementById('2503.03803v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to CVPR 2025. Project Page: https://egolife-ai.github.io/. 

arXiv:2503.03746 (https://arxiv.org/abs/2503.03746) [pdf, other] cs.CL, cs.AI
Process-based Self-Rewarding Language Models
Authors: Shimao Zhang, Xiao Liu, Xin Zhang, Junxiao Liu, Zheheng Luo, Shujian Huang, Yeyun Gong
Abstract: Large language models have demonstrated outstanding performance across various downstream tasks and have been widely applied in multiple scenarios. Human-annotated preference data is used to further improve their performance through training, but this is constrained by the upper limit of human performance. The Self-Rewarding method has therefore been proposed, in which LLMs generate training data by rewarding their own outputs. However, the existing self-rewarding paradigm is not effective in mathematical reasoning scenarios and may even lead to a decline in performance. In this work, we propose the Process-based Self-Rewarding pipeline for language models, which introduces long-thought reasoning, step-wise LLM-as-a-Judge, and step-wise preference optimization within the self-rewarding paradigm. Our new paradigm successfully enhances the performance of LLMs on multiple mathematical reasoning benchmarks through iterative Process-based Self-Rewarding, demonstrating the immense potential of self-rewarding to achieve LLM reasoning that may surpass human capabilities.
Submitted 5 March, 2025; originally announced March 2025.
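The step-wise flavor of the pipeline, collecting preferences per reasoning step rather than per full solution, can be sketched as a data-collection loop. The generate/judge functions below are stubs standing in for the model's own generation and LLM-as-a-Judge calls, which the abstract does not specify:

```python
def generate_candidates(prefix: str) -> list[str]:
    """Stub: the LLM proposes alternative next reasoning steps."""
    return [prefix + " -> step A", prefix + " -> step B"]

def judge_step(prefix: str, step: str) -> float:
    """Stub: the same LLM scores a single step (step-wise LLM-as-a-Judge)."""
    return float(len(step))

def collect_step_preferences(problem: str, depth: int = 3):
    """At each step, rank the model's own candidates and keep the
    (context, chosen, rejected) triple for step-wise preference optimization."""
    prefs, prefix = [], problem
    for _ in range(depth):
        ranked = sorted(generate_candidates(prefix),
                        key=lambda s: judge_step(prefix, s), reverse=True)
        prefs.append((prefix, ranked[0], ranked[-1]))
        prefix = ranked[0]        # continue along the preferred path
    return prefs

print(collect_step_preferences("Solve: 2 + 2 * 3"))
```

Iterating this loop and training on the collected triples is, roughly, what the abstract calls iterative Process-based Self-Rewarding.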
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.03746v1-abstract-full').style.display = 'none'; document.getElementById('2503.03746v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.03367">arXiv:2503.03367</a> <span> [<a href="https://arxiv.org/pdf/2503.03367">pdf</a>, <a href="https://arxiv.org/format/2503.03367">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Top-K Maximum Intensity Projection Priors for 3D Liver Vessel Segmentation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xiaotong Zhang</a>, <a href="/search/cs?searchtype=author&query=Broersen%2C+A">Alexander Broersen</a>, <a href="/search/cs?searchtype=author&query=van+Erp%2C+G+C">Gonnie CM van Erp</a>, <a href="/search/cs?searchtype=author&query=Pintea%2C+S+L">Silvia L. Pintea</a>, <a href="/search/cs?searchtype=author&query=Dijkstra%2C+J">Jouke Dijkstra</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.03367v1-abstract-short" style="display: inline;"> Liver-vessel segmentation is an essential task in the pre-operative planning of liver resection. State-of-the-art 2D or 3D convolution-based methods focusing on liver vessel segmentation on 2D CT cross-sectional views, which do not take into account the global liver-vessel topology. To maintain this global vessel topology, we rely on the underlying physics used in the CT reconstruction process, an… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.03367v1-abstract-full').style.display = 'inline'; document.getElementById('2503.03367v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.03367v1-abstract-full" style="display: none;"> Liver-vessel segmentation is an essential task in the pre-operative planning of liver resection. State-of-the-art 2D or 3D convolution-based methods focusing on liver vessel segmentation on 2D CT cross-sectional views, which do not take into account the global liver-vessel topology. To maintain this global vessel topology, we rely on the underlying physics used in the CT reconstruction process, and apply this to liver-vessel segmentation. Concretely, we introduce the concept of top-k maximum intensity projections, which mimics the CT reconstruction by replacing the integral along each projection direction, with keeping the top-k maxima along each projection direction. We use these top-k maximum projections to condition a diffusion model and generate 3D liver-vessel trees. We evaluate our 3D liver-vessel segmentation on the 3D-ircadb-01 dataset, and achieve the highest Dice coefficient, intersection-over-union (IoU), and Sensitivity scores compared to prior work. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.03367v1-abstract-full').style.display = 'none'; document.getElementById('2503.03367v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted in 2025 IEEE International Symposium on Biomedical Imaging (ISBI 2025)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.03329">arXiv:2503.03329</a> <span> [<a href="https://arxiv.org/pdf/2503.03329">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Medical Physics">physics.med-ph</span> </div> </div> <p class="title is-5 mathjax"> Deep Learning-Based Diffusion MRI Tractography: Integrating Spatial and Anatomical Information </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yang%2C+Y">Yiqiong Yang</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+Y">Yitian Yuan</a>, <a href="/search/cs?searchtype=author&query=Ren%2C+B">Baoxing Ren</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+Y">Ye Wu</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+Y">Yanqiu Feng</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xinyuan Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.03329v1-abstract-short" style="display: inline;"> Diffusion MRI tractography technique enables non-invasive visualization of the white matter pathways in the brain. It plays a crucial role in neuroscience and clinical fields by facilitating the study of brain connectivity and neurological disorders. However, the accuracy of reconstructed tractograms has been a longstanding challenge. Recently, deep learning methods have been applied to improve tr… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.03329v1-abstract-full').style.display = 'inline'; document.getElementById('2503.03329v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.03329v1-abstract-full" style="display: none;"> Diffusion MRI tractography technique enables non-invasive visualization of the white matter pathways in the brain. It plays a crucial role in neuroscience and clinical fields by facilitating the study of brain connectivity and neurological disorders. However, the accuracy of reconstructed tractograms has been a longstanding challenge. Recently, deep learning methods have been applied to improve tractograms for better white matter coverage, but often comes at the expense of generating excessive false-positive connections. This is largely due to their reliance on local information to predict long range streamlines. 
To improve the accuracy of streamline propagation predictions, we introduce a novel deep learning framework that integrates image-domain spatial information and anatomical information along tracts, with the former extracted through convolutional layers and the later modeled via a Transformer-decoder. Additionally, we employ a weighted loss function to address fiber class imbalance encountered during training. We evaluate the proposed method on the simulated ISMRM 2015 Tractography Challenge dataset, achieving a valid streamline rate of 66.2%, white matter coverage of 63.8%, and successfully reconstructing 24 out of 25 bundles. Furthermore, on the multi-site Tractoinferno dataset, the proposed method demonstrates its ability to handle various diffusion MRI acquisition schemes, achieving a 5.7% increase in white matter coverage and a 4.1% decrease in overreach compared to RNN-based methods. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.03329v1-abstract-full').style.display = 'none'; document.getElementById('2503.03329v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.03294">arXiv:2503.03294</a> <span> [<a href="https://arxiv.org/pdf/2503.03294">pdf</a>, <a href="https://arxiv.org/format/2503.03294">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Interactive Segmentation and Report Generation for CT Images </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Gu%2C+Y">Yannian Gu</a>, <a href="/search/cs?searchtype=author&query=Lei%2C+W">Wenhui Lei</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+H">Hanyu Chen</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xiaofan Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+S">Shaoting Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.03294v1-abstract-short" style="display: inline;"> Automated CT report generation plays a crucial role in improving diagnostic accuracy and clinical workflow efficiency. However, existing methods lack interpretability and impede patient-clinician understanding, while their static nature restricts radiologists from dynamically adjusting assessments during image review. Inspired by interactive segmentation techniques, we propose a novel interactive… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.03294v1-abstract-full').style.display = 'inline'; document.getElementById('2503.03294v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.03294v1-abstract-full" style="display: none;"> Automated CT report generation plays a crucial role in improving diagnostic accuracy and clinical workflow efficiency. 
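The weighted loss is a standard remedy for class imbalance and is easy to sketch. The class counts below are invented for illustration; the paper does not state its exact weighting scheme:

```python
import torch
import torch.nn as nn

# Hypothetical per-class streamline counts; rare fiber classes get larger weights.
counts = torch.tensor([500.0, 120.0, 30.0, 5.0])
weights = counts.sum() / (len(counts) * counts)   # inverse-frequency weighting

criterion = nn.CrossEntropyLoss(weight=weights)
logits = torch.randn(16, 4)                       # per-step direction/class predictions
labels = torch.randint(0, 4, (16,))
print(criterion(logits, labels).item())           # rare classes now contribute more
```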

arXiv:2503.03294 (https://arxiv.org/abs/2503.03294) [pdf, other] eess.IV, cs.CV
Interactive Segmentation and Report Generation for CT Images
Authors: Yannian Gu, Wenhui Lei, Hanyu Chen, Xiaofan Zhang, Shaoting Zhang
Abstract: Automated CT report generation plays a crucial role in improving diagnostic accuracy and clinical workflow efficiency. However, existing methods lack interpretability and impede patient-clinician understanding, while their static nature restricts radiologists from dynamically adjusting assessments during image review. Inspired by interactive segmentation techniques, we propose a novel interactive framework for 3D lesion morphology reporting that seamlessly generates segmentation masks with comprehensive attribute descriptions, enabling clinicians to generate detailed lesion profiles for enhanced diagnostic assessment. To the best of our knowledge, we are the first to integrate interactive segmentation and structured reports for 3D CT medical images. Experimental results across 15 lesion types demonstrate the effectiveness of our approach in providing a more comprehensive and reliable reporting system for lesion segmentation and capture. The source code will be made publicly available following paper acceptance.
Submitted 5 March, 2025; originally announced March 2025.

arXiv:2503.03265 (https://arxiv.org/abs/2503.03265) [pdf, other] cs.CV
Optimizing for the Shortest Path in Denoising Diffusion Model
Authors: Ping Chen, Xingpeng Zhang, Zhaoxiang Liu, Huan Hu, Xiang Liu, Kai Wang, Min Wang, Yanlin Qian, Shiguo Lian
Abstract: In this research, we propose a novel denoising diffusion model based on shortest-path modeling that optimizes residual propagation to enhance both denoising efficiency and quality. Drawing on Denoising Diffusion Implicit Models (DDIM) and insights from graph theory, our model, termed the Shortest Path Diffusion Model (ShortDF), treats the denoising process as a shortest-path problem aimed at minimizing reconstruction error. By optimizing the initial residuals, we improve the efficiency of the reverse diffusion process and the quality of the generated samples. Extensive experiments on multiple standard benchmarks demonstrate that ShortDF significantly reduces diffusion time (or steps) while enhancing the visual fidelity of generated samples compared to prior art. We believe this work paves the way for interactive diffusion-based applications and establishes a foundation for rapid data generation. Code is available at https://github.com/UnicomAI/ShortDF
Submitted 5 March, 2025; v1 submitted 5 March, 2025; originally announced March 2025.
Comments: Accepted by CVPR 2025 (10 pages, 6 figures)
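For context, the DDIM update that ShortDF starts from fits in a few lines; the shortest-path view then chooses how to traverse such steps. This sketch is plain deterministic DDIM, not the authors' code:

```python
import torch

def ddim_step(x_t, eps_pred, alpha_bar_t, alpha_bar_prev):
    """One deterministic DDIM reverse step: estimate x0 from the predicted
    noise, then re-project it to the previous noise level."""
    x0_pred = (x_t - (1 - alpha_bar_t).sqrt() * eps_pred) / alpha_bar_t.sqrt()
    return alpha_bar_prev.sqrt() * x0_pred + (1 - alpha_bar_prev).sqrt() * eps_pred

x_t = torch.randn(1, 3, 32, 32)     # current noisy sample
eps = torch.randn_like(x_t)         # stand-in for the noise network's output
x_prev = ddim_step(x_t, eps, torch.tensor(0.50), torch.tensor(0.70))
print(x_prev.shape)
```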
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepet by CVPR 2025 (10 pages, 6 figures)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.03211">arXiv:2503.03211</a> <span> [<a href="https://arxiv.org/pdf/2503.03211">pdf</a>, <a href="https://arxiv.org/format/2503.03211">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> NodeReg: Mitigating the Imbalance and Distribution Shift Effects in Semi-Supervised Node Classification via Norm Consistency </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yang%2C+S">Shenzhi Yang</a>, <a href="/search/cs?searchtype=author&query=Xia%2C+J">Jun Xia</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+J">Jingbo Zhou</a>, <a href="/search/cs?searchtype=author&query=Yao%2C+X">Xingkai Yao</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xiaofang Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.03211v1-abstract-short" style="display: inline;"> Aggregating information from neighboring nodes benefits graph neural networks (GNNs) in semi-supervised node classification tasks. Nevertheless, this mechanism also renders nodes susceptible to the influence of their neighbors. For instance, this will occur when the neighboring nodes are imbalanced or the neighboring nodes contain noise, which can even affect the GNN's ability to generalize out of… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.03211v1-abstract-full').style.display = 'inline'; document.getElementById('2503.03211v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.03211v1-abstract-full" style="display: none;"> Aggregating information from neighboring nodes benefits graph neural networks (GNNs) in semi-supervised node classification tasks. Nevertheless, this mechanism also renders nodes susceptible to the influence of their neighbors. For instance, this will occur when the neighboring nodes are imbalanced or the neighboring nodes contain noise, which can even affect the GNN's ability to generalize out of distribution. We find that ensuring the consistency of the norm for node representations can significantly reduce the impact of these two issues on GNNs. To this end, we propose a regularized optimization method called NodeReg that enforces the consistency of node representation norms. This method is simple but effective and satisfies Lipschitz continuity, thus facilitating stable optimization and significantly improving semi-supervised node classification performance under the above two scenarios. To illustrate, in the imbalance scenario, when training a GCN with an imbalance ratio of 0.1, NodeReg outperforms the most competitive baselines by 1.4%-25.9% in F1 score across five public datasets. Similarly, in the distribution shift scenario, NodeReg outperforms the most competitive baseline by 1.4%-3.1% in accuracy. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.03211v1-abstract-full').style.display = 'none'; document.getElementById('2503.03211v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> </ol> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Zhang%2C+X&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Zhang%2C+X&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Zhang%2C+X&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Zhang%2C+X&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Zhang%2C+X&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Zhang%2C+X&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">…</span></li> </ul> </nav> <div class="is-hidden-tablet"> <!-- feedback for mobile only --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a> </span> </div> </div> </main> <footer> <div class="columns is-desktop" role="navigation" aria-label="Secondary"> <!-- MetaColumn 1 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div> <!-- end MetaColumn 1 --> <!-- MetaColumn 2 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> 
<li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div> <!-- end MetaColumn 2 --> </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>