Search | arXiv e-print repository

Showing 1–50 of 747 results for author: Zhou, L
Searching in archive cs.

1. arXiv:2503.23029 [pdf, other] (cs.CL)
A Retrieval-Augmented Knowledge Mining Method with Deep Thinking LLMs for Biomedical Research and Clinical Support
Authors: Yichun Feng, Jiawei Wang, Ruikun He, Lu Zhou, Yixue Li
Abstract: Knowledge graphs and large language models (LLMs) are key tools for biomedical knowledge integration and reasoning, facilitating structured organization of scientific articles and discovery of complex semantic relationships. However, current methods face challenges: knowledge graph construction is limited by complex terminology, data heterogeneity, and rapid knowledge evolution, while LLMs show limitations in retrieval and reasoning, making it difficult to uncover cross-document associations and reasoning pathways. To address these issues, we propose a pipeline that uses LLMs to construct a biomedical knowledge graph (BioStrataKG) from large-scale articles and builds a cross-document question-answering dataset (BioCDQA) to evaluate latent knowledge retrieval and multi-hop reasoning. We then introduce Integrated and Progressive Retrieval-Augmented Reasoning (IP-RAR) to enhance retrieval accuracy and knowledge reasoning. IP-RAR maximizes information recall through Integrated Reasoning-based Retrieval and refines knowledge via Progressive Reasoning-based Generation, using self-reflection to achieve deep thinking and precise contextual understanding. Experiments show that IP-RAR improves document retrieval F1 score by 20% and answer generation accuracy by 25% over existing methods. This framework helps doctors efficiently integrate treatment evidence for personalized medication plans and enables researchers to analyze advancements and research gaps, accelerating scientific discovery and decision-making.
Submitted 29 March, 2025; originally announced March 2025.
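For illustration, the retrieve-reflect-generate loop this abstract describes might be pictured as follows. This is a minimal sketch, not the paper's code: retrieve, generate, and reflect are hypothetical stand-ins for the index and LLM calls, and context is assumed to be a list of passages.

    # Hypothetical sketch of a retrieval-augmented reasoning loop in the
    # spirit of IP-RAR as the abstract describes it.
    def ip_rar_answer(question, retrieve, generate, reflect, max_rounds=3):
        context = retrieve(question)          # reasoning-based retrieval
        answer = generate(question, context)  # initial draft answer
        for _ in range(max_rounds):           # progressive refinement
            critique = reflect(question, context, answer)  # self-reflection
            if critique is None:              # reflection finds no gaps
                break
            context += retrieve(critique)     # fetch evidence for the gaps
            answer = generate(question, context)
        return answer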
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.22231">arXiv:2503.22231</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.22231">pdf</a>, <a href="https://arxiv.org/format/2503.22231">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> CoGen: 3D Consistent Video Generation via Adaptive Conditioning for Autonomous Driving </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Ji%2C+Y">Yishen Ji</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+Z">Ziyue Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+Z">Zhenxin Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Xiong%2C+K">Kaixin Xiong</a>, <a href="/search/cs?searchtype=author&amp;query=Lu%2C+M">Ming Lu</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Z">Zhiqi Li</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+L">Lijun Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+H">Haiyang Sun</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+B">Bing Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Lu%2C+T">Tong Lu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.22231v1-abstract-short" style="display: inline;"> Recent progress in driving video generation has shown significant potential for enhancing self-driving systems by providing scalable and controllable training data. Although pretrained state-of-the-art generation models, guided by 2D layout conditions (e.g., HD maps and bounding boxes), can produce photorealistic driving videos, achieving controllable multi-view videos with high 3D consistency rem&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.22231v1-abstract-full').style.display = 'inline'; document.getElementById('2503.22231v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.22231v1-abstract-full" style="display: none;"> Recent progress in driving video generation has shown significant potential for enhancing self-driving systems by providing scalable and controllable training data. Although pretrained state-of-the-art generation models, guided by 2D layout conditions (e.g., HD maps and bounding boxes), can produce photorealistic driving videos, achieving controllable multi-view videos with high 3D consistency remains a major challenge. To tackle this, we introduce a novel spatial adaptive generation framework, CoGen, which leverages advances in 3D generation to improve performance in two key aspects: (i) To ensure 3D consistency, we first generate high-quality, controllable 3D conditions that capture the geometry of driving scenes. By replacing coarse 2D conditions with these fine-grained 3D representations, our approach significantly enhances the spatial consistency of the generated videos. (ii) Additionally, we introduce a consistency adapter module to strengthen the robustness of the model to multi-condition control. 
The results demonstrate that this method excels in preserving geometric fidelity and visual realism, offering a reliable video generation solution for autonomous driving. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.22231v1-abstract-full').style.display = 'none'; document.getElementById('2503.22231v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.18536">arXiv:2503.18536</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.18536">pdf</a>, <a href="https://arxiv.org/format/2503.18536">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> DiN: Diffusion Model for Robust Medical VQA with Semantic Noisy Labels </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Guo%2C+E">Erjian Guo</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+Z">Zhen Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Z">Zicheng Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+T">Tong Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Y">Yunyi Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+L">Luping Zhou</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.18536v1-abstract-short" style="display: inline;"> Medical Visual Question Answering (Med-VQA) systems benefit the interpretation of medical images containing critical clinical information. However, the challenge of noisy labels and limited high-quality datasets remains underexplored. To address this, we establish the first benchmark for noisy labels in Med-VQA by simulating human mislabeling with semantically designed noise types. More importantl&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.18536v1-abstract-full').style.display = 'inline'; document.getElementById('2503.18536v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.18536v1-abstract-full" style="display: none;"> Medical Visual Question Answering (Med-VQA) systems benefit the interpretation of medical images containing critical clinical information. However, the challenge of noisy labels and limited high-quality datasets remains underexplored. To address this, we establish the first benchmark for noisy labels in Med-VQA by simulating human mislabeling with semantically designed noise types. More importantly, we introduce the DiN framework, which leverages a diffusion model to handle noisy labels in Med-VQA. Unlike the dominant classification-based VQA approaches that directly predict answers, our Answer Diffuser (AD) module employs a coarse-to-fine process, refining answer candidates with a diffusion model for improved accuracy. 
The Answer Condition Generator (ACG) further enhances this process by generating task-specific conditional information via integrating answer embeddings with fused image-question features. To address label noise, our Noisy Label Refinement(NLR) module introduces a robust loss function and dynamic answer adjustment to further boost the performance of the AD module. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.18536v1-abstract-full').style.display = 'none'; document.getElementById('2503.18536v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.15404">arXiv:2503.15404</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.15404">pdf</a>, <a href="https://arxiv.org/format/2503.15404">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> </div> </div> <p class="title is-5 mathjax"> Improving Adversarial Transferability on Vision Transformers via Forward Propagation Refinement </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Ren%2C+Y">Yuchen Ren</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+Z">Zhengyu Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Lin%2C+C">Chenhao Lin</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+B">Bo Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+L">Lu Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Z">Zhe Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Shen%2C+C">Chao Shen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.15404v1-abstract-short" style="display: inline;"> Vision Transformers (ViTs) have been widely applied in various computer vision and vision-language tasks. To gain insights into their robustness in practical scenarios, transferable adversarial examples on ViTs have been extensively studied. A typical approach to improving adversarial transferability is by refining the surrogate model. However, existing work on ViTs has restricted their surrogate&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.15404v1-abstract-full').style.display = 'inline'; document.getElementById('2503.15404v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.15404v1-abstract-full" style="display: none;"> Vision Transformers (ViTs) have been widely applied in various computer vision and vision-language tasks. To gain insights into their robustness in practical scenarios, transferable adversarial examples on ViTs have been extensively studied. A typical approach to improving adversarial transferability is by refining the surrogate model. However, existing work on ViTs has restricted their surrogate refinement to backward propagation. 
In this work, we instead focus on Forward Propagation Refinement (FPR) and specifically refine two key modules of ViTs: attention maps and token embeddings. For attention maps, we propose Attention Map Diversification (AMD), which diversifies certain attention maps and also implicitly imposes beneficial gradient vanishing during backward propagation. For token embeddings, we propose Momentum Token Embedding (MTE), which accumulates historical token embeddings to stabilize the forward updates in both the Attention and MLP blocks. We conduct extensive experiments with adversarial examples transferred from ViTs to various CNNs and ViTs, demonstrating that our FPR outperforms the current best (backward) surrogate refinement by up to 7.0\% on average. We also validate its superiority against popular defenses and its compatibility with other transfer methods. Codes and appendix are available at https://github.com/RYC-98/FPR. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.15404v1-abstract-full').style.display = 'none'; document.getElementById('2503.15404v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">CVPR2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.12006">arXiv:2503.12006</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.12006">pdf</a>, <a href="https://arxiv.org/format/2503.12006">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> ROS-SAM: High-Quality Interactive Segmentation for Remote Sensing Moving Object </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Shan%2C+Z">Zhe Shan</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Y">Yang Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+L">Lei Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Yan%2C+C">Cheng Yan</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+H">Heng Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Xie%2C+X">Xia Xie</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.12006v1-abstract-short" style="display: inline;"> The availability of large-scale remote sensing video data underscores the importance of high-quality interactive segmentation. However, challenges such as small object sizes, ambiguous features, and limited generalization make it difficult for current methods to achieve this goal. 
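For illustration, the Momentum Token Embedding idea, accumulating historical token embeddings to stabilize forward updates, can be sketched as an exponential moving average applied to the token embeddings at each attack iteration. The momentum value and hook placement here are assumptions, not the paper's exact design.

    import torch

    # Illustrative momentum-style token-embedding smoothing in the spirit
    # of MTE as described in the abstract; not the paper's implementation.
    class MomentumTokens:
        def __init__(self, momentum: float = 0.9):
            self.momentum = momentum
            self.history = None  # EMA of past token embeddings

        def __call__(self, tokens: torch.Tensor) -> torch.Tensor:
            if self.history is None:
                ema = tokens
            else:
                # Gradient still flows through the current `tokens` term.
                ema = self.momentum * self.history + (1 - self.momentum) * tokens
            self.history = ema.detach()
            return ema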
5. arXiv:2503.12006 [pdf, other] (cs.CV)
ROS-SAM: High-Quality Interactive Segmentation for Remote Sensing Moving Object
Authors: Zhe Shan, Yang Liu, Lei Zhou, Cheng Yan, Heng Wang, Xia Xie
Abstract: The availability of large-scale remote sensing video data underscores the importance of high-quality interactive segmentation. However, challenges such as small object sizes, ambiguous features, and limited generalization make it difficult for current methods to achieve this goal. In this work, we propose ROS-SAM, a method designed to achieve high-quality interactive segmentation while preserving generalization across diverse remote sensing data. ROS-SAM is built upon three key innovations: 1) LoRA-based fine-tuning, which enables efficient domain adaptation while maintaining SAM's generalization ability, 2) Enhancement of deep network layers to improve the discriminability of extracted features, thereby reducing misclassifications, and 3) Integration of global context with local boundary details in the mask decoder to generate high-quality segmentation masks. Additionally, we design the data pipeline to ensure the model learns to better handle objects at varying scales during training while focusing on high-quality predictions during inference. Experiments on remote sensing video datasets show that the redesigned data pipeline boosts the IoU by 6%, while ROS-SAM increases the IoU by 13%. Finally, when evaluated on existing remote sensing object tracking datasets, ROS-SAM demonstrates impressive zero-shot capabilities, generating masks that closely resemble manual annotations. These results confirm ROS-SAM as a powerful tool for fine-grained segmentation in remote sensing applications. Code is available at https://github.com/ShanZard/ROS-SAM.
Submitted 15 March, 2025; originally announced March 2025.
Comments: Accepted to CVPR 2025
6. arXiv:2503.09248 [pdf, other] (cs.CV)
Bayesian Test-Time Adaptation for Vision-Language Models
Authors: Lihua Zhou, Mao Ye, Shuaifeng Li, Nianxin Li, Xiatian Zhu, Lei Deng, Hongbin Liu, Zhen Lei
Abstract: Test-time adaptation with pre-trained vision-language models, such as CLIP, aims to adapt the model to new, potentially out-of-distribution test data. Existing methods calculate the similarity between visual embedding and learnable class embeddings, which are initialized by text embeddings, for zero-shot image classification. In this work, we first analyze this process based on Bayes' theorem, and observe that the core factors influencing the final prediction are the likelihood and the prior. However, existing methods essentially focus on adapting class embeddings to adapt the likelihood, but they often ignore the importance of the prior. To address this gap, we propose a novel approach, Bayesian Class Adaptation (BCA), which in addition to continuously updating class embeddings to adapt the likelihood, also uses the posterior of incoming samples to continuously update the prior for each class embedding. This dual updating mechanism allows the model to better adapt to distribution shifts and achieve higher prediction accuracy. Our method not only surpasses existing approaches in terms of performance metrics but also maintains superior inference rates and memory usage, making it highly efficient and practical for real-world applications.
Submitted 17 March, 2025; v1 submitted 12 March, 2025; originally announced March 2025.
Comments: Accepted to CVPR 2025
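The dual update the abstract describes, a likelihood from embedding similarity combined with a per-class prior that is itself refreshed from the posteriors of incoming samples, follows directly from Bayes' rule. A minimal sketch; the update rate is an assumed hyperparameter, not the paper's.

    import numpy as np

    # Sketch of the Bayesian view in the abstract: prediction is
    # likelihood (similarity) times prior, and the prior adapts online.
    def bca_predict(image_emb, class_embs, prior, lr=0.01):
        logits = class_embs @ image_emb              # similarity scores
        likelihood = np.exp(logits - logits.max())   # softmax numerator
        posterior = likelihood * prior               # posterior ∝ lik * prior
        posterior /= posterior.sum()
        prior = (1 - lr) * prior + lr * posterior    # adapt the prior online
        return posterior.argmax(), prior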
7. arXiv:2503.07565 [pdf, other] (cs.LG, cs.AI, stat.ML)
Inductive Moment Matching
Authors: Linqi Zhou, Stefano Ermon, Jiaming Song
Abstract: Diffusion models and Flow Matching generate high-quality samples but are slow at inference, and distilling them into few-step models often leads to instability and extensive tuning. To resolve these trade-offs, we propose Inductive Moment Matching (IMM), a new class of generative models for one- or few-step sampling with a single-stage training procedure. Unlike distillation, IMM does not require pre-training initialization and optimization of two networks; and unlike Consistency Models, IMM guarantees distribution-level convergence and remains stable under various hyperparameters and standard model architectures. IMM surpasses diffusion models on ImageNet-256x256 with 1.99 FID using only 8 inference steps and achieves state-of-the-art 2-step FID of 1.98 on CIFAR-10 for a model trained from scratch.
Submitted 31 March, 2025; v1 submitted 10 March, 2025; originally announced March 2025.
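IMM's actual objective is not given in the abstract. As a generic illustration of matching distributions at the level of their moments, the standard (biased) Gaussian-kernel maximum mean discrepancy between model samples and data samples looks like this:

    import torch

    # Generic kernel MMD; an illustration of moment matching, not IMM.
    def gaussian_mmd(x: torch.Tensor, y: torch.Tensor, sigma: float = 1.0):
        def k(a, b):
            d2 = torch.cdist(a, b) ** 2
            return torch.exp(-d2 / (2 * sigma ** 2))
        return k(x, x).mean() + k(y, y).mean() - 2 * k(x, y).mean()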
8. arXiv:2503.07154 [pdf, other] (cs.LG, cs.AI)
Ideas in Inference-time Scaling can Benefit Generative Pre-training Algorithms
Authors: Jiaming Song, Linqi Zhou
Abstract: Recent years have seen significant advancements in foundation models through generative pre-training, yet algorithmic innovation in this space has largely stagnated around autoregressive models for discrete signals and diffusion models for continuous signals. This stagnation creates a bottleneck that prevents us from fully unlocking the potential of rich multi-modal data, which in turn limits the progress on multimodal intelligence. We argue that an inference-first perspective, which prioritizes scaling efficiency during inference time across sequence length and refinement steps, can inspire novel generative pre-training algorithms. Using Inductive Moment Matching (IMM) as a concrete example, we demonstrate how addressing limitations in diffusion models' inference process through targeted modifications yields a stable, single-stage algorithm that achieves superior sample quality with over an order of magnitude greater inference efficiency.
Submitted 11 March, 2025; v1 submitted 10 March, 2025; originally announced March 2025.

9. arXiv:2503.06378 [pdf, other] (cs.AI, cs.CL, cs.CY)
General Scales Unlock AI Evaluation with Explanatory and Predictive Power
Authors: Lexin Zhou, Lorenzo Pacchiardi, Fernando Martínez-Plumed, Katherine M. Collins, Yael Moros-Daval, Seraphina Zhang, Qinlin Zhao, Yitian Huang, Luning Sun, Jonathan E. Prunty, Zongqian Li, Pablo Sánchez-García, Kexin Jiang Chen, Pablo A. M. Casares, Jiyun Zu, John Burden, Behzad Mehrbakhsh, David Stillwell, Manuel Cebrian, Jindong Wang, Peter Henderson, Sherry Tongshuang Wu, Patrick C. Kyllonen, Lucy Cheke, Xing Xie, et al. (1 additional author not shown)
Abstract: Ensuring safe and effective use of AI requires understanding and anticipating its performance on novel tasks, from advanced scientific challenges to transformed workplace activities. So far, benchmarking has guided progress in AI, but it has offered limited explanatory and predictive power for general-purpose AI systems, given the low transferability across diverse tasks. In this paper, we introduce general scales for AI evaluation that can explain what common AI benchmarks really measure, extract ability profiles of AI systems, and predict their performance for new task instances, in- and out-of-distribution. Our fully-automated methodology builds on 18 newly-crafted rubrics that place instance demands on general scales that do not saturate. Illustrated for 15 large language models and 63 tasks, high explanatory power is unleashed from inspecting the demand and ability profiles, bringing insights on the sensitivity and specificity exhibited by different benchmarks, and how knowledge, metacognition and reasoning are affected by model size, chain-of-thought and distillation. Surprisingly, high predictive power at the instance level becomes possible using these demand levels, providing superior estimates over black-box baseline predictors based on embeddings or finetuning, especially in out-of-distribution settings (new tasks and new benchmarks). The scales, rubrics, battery, techniques and results presented here represent a major step for AI evaluation, underpinning the reliable deployment of AI in the years ahead. (Collaborative platform: https://kinds-of-intelligence-cfi.github.io/ADELE.)
Submitted 15 March, 2025; v1 submitted 8 March, 2025; originally announced March 2025.
10. arXiv:2503.03270 [pdf, other] (cs.CV, cs.CR)
Reduced Spatial Dependency for More General Video-level Deepfake Detection
Authors: Beilin Chu, Xuan Xu, Yufei Zhang, Weike You, Linna Zhou
Abstract: As a prominent form of AI-generated content, Deepfake has raised significant safety concerns. Although it has been demonstrated that temporal consistency cues offer better generalization capability, existing methods based on CNNs inevitably introduce spatial bias, which hinders the extraction of intrinsic temporal features. To address this issue, we propose a novel method called Spatial Dependency Reduction (SDR), which integrates common temporal consistency features from multiple spatially-perturbed clusters, to reduce the dependency of the model on spatial information. Specifically, we design multiple Spatial Perturbation Branches (SPB) to construct spatially-perturbed feature clusters. Subsequently, we utilize the theory of mutual information and propose a Task-Relevant Feature Integration (TRFI) module to capture temporal features residing in similar latent space from these clusters. Finally, the integrated feature is fed into a temporal transformer to capture long-range dependencies. Extensive benchmarks and ablation studies demonstrate the effectiveness and rationale of our approach.
Submitted 5 March, 2025; originally announced March 2025.
Comments: 5 pages, 2 figures. Accepted to ICASSP 2025

11. arXiv:2503.02862 [pdf, other] (cs.CR, cs.DB)
Privacy and Accuracy-Aware AI/ML Model Deduplication
Authors: Hong Guan, Lei Yu, Lixi Zhou, Li Xiong, Kanchan Chowdhury, Lulu Xie, Xusheng Xiao, Jia Zou
Abstract: With the growing adoption of privacy-preserving machine learning algorithms, such as Differentially Private Stochastic Gradient Descent (DP-SGD), training or fine-tuning models on private datasets has become increasingly prevalent. This shift has led to the need for models offering varying privacy guarantees and utility levels to satisfy diverse user requirements. However, managing numerous versions of large models introduces significant operational challenges, including increased inference latency, higher resource consumption, and elevated costs. Model deduplication is a technique widely used by many model serving and database systems to support high-performance and low-cost inference queries and model diagnosis queries. However, none of the existing model deduplication works has considered privacy, leading to unbounded aggregation of privacy costs for certain deduplicated models and inefficiencies when applied to deduplicate DP-trained models. We formalize the problems of deduplicating DP-trained models for the first time and propose a novel privacy- and accuracy-aware deduplication mechanism to address the problems. We developed a greedy strategy to select and assign base models to target models to minimize storage and privacy costs. When deduplicating a target model, we dynamically schedule accuracy validations and apply the Sparse Vector Technique to reduce the privacy costs associated with private validation data. Compared to baselines that do not provide privacy guarantees, our approach improved the compression ratio by up to 35× for individual models (including large language models and vision transformers). We also observed up to 43× inference speedup due to the reduction of I/O operations.
Submitted 4 March, 2025; originally announced March 2025.
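The Sparse Vector Technique mentioned for the accuracy validations is a standard differential-privacy primitive: answer a stream of queries against a noisy threshold, paying privacy cost only for the at most c queries reported above it. A textbook sketch, assuming sensitivity-1 queries (not this paper's scheduling logic):

    import numpy as np

    # Standard Sparse Vector Technique (AboveThreshold with c answers).
    def sparse_vector(queries, threshold, epsilon, c=1):
        out = []
        rho = np.random.laplace(scale=2 * c / epsilon)    # noisy threshold
        count = 0
        for q in queries:
            nu = np.random.laplace(scale=4 * c / epsilon)  # per-query noise
            if q + nu >= threshold + rho:
                out.append(True)
                count += 1
                if count >= c:                             # budget exhausted
                    break
            else:
                out.append(False)
        return out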
arXiv:2503.01699 (https://arxiv.org/abs/2503.01699)
Subjects: Computational Engineering, Finance, and Science (cs.CE)
Title: Camera Measurement of Blood Oxygen Saturation
Authors: Jiankai Tang, Xin Liu, Daniel McDuff, Zhang Jiang, Hongming Hu, Luxi Zhou, Nodoka Nagao, Haruta Suzuki, Yuki Nagahama, Wei Li, Linhong Ji, Yuanchun Shi, Izumi Nishidate, Yuntao Wang
Abstract: Blood oxygen saturation (SpO2) is a crucial vital sign routinely monitored in medical settings. Traditional methods require dedicated contact sensors, limiting accessibility and comfort. This study presents a deep learning framework for contactless SpO2 measurement using an off-the-shelf camera, addressing challenges related to lighting variations and skin tone diversity. We conducted two large-scale studies with diverse participants and evaluated our method against traditional signal processing approaches in intra- and inter-dataset scenarios. Our approach demonstrated consistent accuracy across demographic groups, highlighting the feasibility of camera-based SpO2 monitoring as a scalable and non-invasive tool for remote health assessment.
Submitted 3 March, 2025; originally announced March 2025.
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.01699v1-abstract-full').style.display = 'none'; document.getElementById('2503.01699v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.00760">arXiv:2503.00760</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2503.00760">pdf</a>, <a href="https://arxiv.org/format/2503.00760">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> NCF: Neural Correspondence Field for Medical Image Registration </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+L">Lei Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Yuan%2C+N">Nimu Yuan</a>, <a href="/search/cs?searchtype=author&amp;query=Ehrlich%2C+K">Katjana Ehrlich</a>, <a href="/search/cs?searchtype=author&amp;query=Qi%2C+J">Jinyi Qi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.00760v1-abstract-short" style="display: inline;"> Deformable image registration is a fundamental task in medical image processing. Traditional optimization-based methods often struggle with accuracy in dealing with complex deformation. Recently, learning-based methods have achieved good performance on public datasets, but the scarcity of medical image data makes it challenging to build a generalizable model to handle diverse real-world scenarios.&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.00760v1-abstract-full').style.display = 'inline'; document.getElementById('2503.00760v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.00760v1-abstract-full" style="display: none;"> Deformable image registration is a fundamental task in medical image processing. Traditional optimization-based methods often struggle with accuracy in dealing with complex deformation. Recently, learning-based methods have achieved good performance on public datasets, but the scarcity of medical image data makes it challenging to build a generalizable model to handle diverse real-world scenarios. To address this, we propose a training-data-free learning-based method, Neural Correspondence Field (NCF), which can learn from just one data pair. Our approach employs a compact neural network to model the correspondence field and optimize model parameters for each individual image pair. Consequently, each pair has a unique set of network weights. Notably, our model is highly efficient, utilizing only 0.06 million parameters. 
arXiv:2502.20356 (https://arxiv.org/abs/2502.20356)
Subjects: Computation and Language (cs.CL); Artificial Intelligence (cs.AI); Machine Learning (cs.LG)
Title: Bridging the Creativity Understanding Gap: Small-Scale Human Alignment Enables Expert-Level Humor Ranking in LLMs
Authors: Kuan Lok Zhou, Jiayi Chen, Siddharth Suresh, Reuben Narad, Timothy T. Rogers, Lalit K Jain, Robert D Nowak, Bob Mankoff, Jifan Zhang
Abstract: Large Language Models (LLMs) have shown significant limitations in understanding creative content, as demonstrated by Hessel et al. (2023)'s influential work on the New Yorker Cartoon Caption Contest (NYCCC). Their study exposed a substantial gap between LLMs and humans in humor comprehension, establishing that understanding and evaluating creative content is a key challenge in AI development. We revisit this challenge by decomposing humor understanding into three components and systematically improving each: enhancing visual understanding through improved annotation, utilizing LLM-generated humor reasoning and explanations, and implementing targeted alignment with human preference data. Our refined approach achieves 82.4% accuracy in caption ranking, significantly improving upon the previous 67% benchmark and matching the performance of world-renowned human experts in this domain. Notably, while attempts to mimic subgroup preferences through various persona prompts showed minimal impact, model finetuning with crowd preferences proved remarkably effective. These findings reveal that LLM limitations in creative judgment can be effectively addressed through focused alignment to specific subgroups and individuals. Lastly, we propose the position that achieving artificial general intelligence necessitates systematic collection of human preference data across creative domains. We advocate that, just as human creativity is deeply influenced by individual and cultural preferences, training LLMs with diverse human preference data may be essential for developing true creative understanding.
Submitted 27 February, 2025; originally announced February 2025.
arXiv:2502.19706 (https://arxiv.org/abs/2502.19706)
Subjects: Human-Computer Interaction (cs.HC); Robotics (cs.RO)
Title: AoECR: AI-ization of Elderly Care Robot
Authors: Linkun Zhou, Jian Li, Yadong Mo, Xiangyan Zhang, Ying Zhang, Shimin Wei
Abstract: Autonomous interaction is crucial for the effective use of elderly care robots. However, developing universal AI architectures is extremely challenging due to the diversity of robot configurations and a lack of datasets. We propose a universal architecture for the AI-ization of elderly care robots, called AoECR. Specifically, based on a nursing bed, we developed a patient-nurse interaction dataset tailored for elderly care scenarios and fine-tuned a large language model to enable it to perform nursing manipulations. Additionally, the inference process includes a self-check chain to ensure the security of control commands, and an expert optimization process further enhances the humanization and personalization of the interactive responses. Physical experiments demonstrated that AoECR exhibits zero-shot generalization across diverse scenarios, understands patients' instructions, implements secure control commands, and delivers humanized and personalized interactive responses. In general, our research provides a valuable dataset reference and AI-ization solutions for elderly care robots.
Submitted 26 February, 2025; originally announced February 2025.
Comments: 11 pages
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">11 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.18968">arXiv:2502.18968</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.18968">pdf</a>, <a href="https://arxiv.org/format/2502.18968">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Know You First and Be You Better: Modeling Human-Like User Simulators via Implicit Profiles </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wang%2C+K">Kuang Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+X">Xianfei Li</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+S">Shenghao Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+L">Li Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Jiang%2C+F">Feng Jiang</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+H">Haizhou Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.18968v2-abstract-short" style="display: inline;"> User simulators are crucial for replicating human interactions with dialogue systems, supporting both collaborative training and automatic evaluation, especially for large language models (LLMs). However, existing simulators often rely solely on text utterances, missing implicit user traits such as personality, speaking style, and goals. In contrast, persona-based methods lack generalizability, as&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.18968v2-abstract-full').style.display = 'inline'; document.getElementById('2502.18968v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.18968v2-abstract-full" style="display: none;"> User simulators are crucial for replicating human interactions with dialogue systems, supporting both collaborative training and automatic evaluation, especially for large language models (LLMs). However, existing simulators often rely solely on text utterances, missing implicit user traits such as personality, speaking style, and goals. In contrast, persona-based methods lack generalizability, as they depend on predefined profiles of famous individuals or archetypes. To address these challenges, we propose User Simulator with implicit Profiles (USP), a framework that infers implicit user profiles from human-machine conversations and uses them to generate more personalized and realistic dialogues. We first develop an LLM-driven extractor with a comprehensive profile schema. Then, we refine the simulation through conditional supervised fine-tuning and reinforcement learning with cycle consistency, optimizing it at both the utterance and conversation levels. Finally, we adopt a diverse profile sampler to capture the distribution of real-world user profiles. Experimental results demonstrate that USP outperforms strong baselines in terms of authenticity and diversity while achieving comparable performance in consistency. 
arXiv:2502.18297 (https://arxiv.org/abs/2502.18297)
Subjects: Machine Learning (cs.LG); Programming Languages (cs.PL)
Title: DeepCircuitX: A Comprehensive Repository-Level Dataset for RTL Code Understanding, Generation, and PPA Analysis
Authors: Zeju Li, Changran Xu, Zhengyuan Shi, Zedong Peng, Yi Liu, Yunhao Zhou, Lingfeng Zhou, Chengyu Ma, Jianyuan Zhong, Xi Wang, Jieru Zhao, Zhufei Chu, Xiaoyan Yang, Qiang Xu
Abstract: This paper introduces DeepCircuitX, a comprehensive repository-level dataset designed to advance RTL (Register Transfer Level) code understanding, generation, and power-performance-area (PPA) analysis. Unlike existing datasets that are limited to either file-level RTL code or physical layout data, DeepCircuitX provides a holistic, multilevel resource that spans repository, file, module, and block-level RTL code. This structure enables more nuanced training and evaluation of large language models (LLMs) for RTL-specific tasks. DeepCircuitX is enriched with Chain of Thought (CoT) annotations, offering detailed descriptions of functionality and structure at multiple levels. These annotations enhance its utility for a wide range of tasks, including RTL code understanding, generation, and completion. Additionally, the dataset includes synthesized netlists and PPA metrics, facilitating early-stage design exploration and enabling accurate PPA prediction directly from RTL code. We demonstrate the dataset's effectiveness on various LLMs finetuned with our dataset and confirm the quality with human evaluations. Our results highlight DeepCircuitX as a critical resource for advancing RTL-focused machine learning applications in hardware design automation. Our data is available at https://zeju.gitbook.io/lcm-team.
Submitted 25 February, 2025; originally announced February 2025.
Comments: 8 pages, 3 figures
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">8 pages, 3 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.14932">arXiv:2502.14932</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.14932">pdf</a>, <a href="https://arxiv.org/format/2502.14932">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Learning to Retrieve and Reason on Knowledge Graph through Active Self-Reflection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+H">Han Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+L">Langshi Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+H">Hanfang Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.14932v1-abstract-short" style="display: inline;"> Extensive research has investigated the integration of large language models (LLMs) with knowledge graphs to enhance the reasoning process. However, understanding how models perform reasoning utilizing structured graph knowledge remains underexplored. Most existing approaches rely on LLMs or retrievers to make binary judgments regarding the utilization of knowledge, which is too coarse. Meanwhile,&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.14932v1-abstract-full').style.display = 'inline'; document.getElementById('2502.14932v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.14932v1-abstract-full" style="display: none;"> Extensive research has investigated the integration of large language models (LLMs) with knowledge graphs to enhance the reasoning process. However, understanding how models perform reasoning utilizing structured graph knowledge remains underexplored. Most existing approaches rely on LLMs or retrievers to make binary judgments regarding the utilization of knowledge, which is too coarse. Meanwhile, there is still a lack of feedback mechanisms for reflection and correction throughout the entire reasoning path. This paper proposes an Active self-Reflection framework for knowledge Graph reasoning ARG, introducing for the first time an end-to-end training approach to achieve iterative reasoning grounded on structured graphs. Within the framework, the model leverages special tokens to \textit{actively} determine whether knowledge retrieval is necessary, performs \textit{reflective} critique based on the retrieved knowledge, and iteratively reasons over the knowledge graph. The reasoning paths generated by the model exhibit high interpretability, enabling deeper exploration of the model&#39;s understanding of structured knowledge. Ultimately, the proposed model achieves outstanding results compared to existing baselines in knowledge graph reasoning tasks. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.14932v1-abstract-full').style.display = 'none'; document.getElementById('2502.14932v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.14558">arXiv:2502.14558</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.14558">pdf</a>, <a href="https://arxiv.org/format/2502.14558">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> FUIA: Model Inversion Attack against Federated Unlearning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+L">Lei Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+Y">Youwen Zhu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.14558v2-abstract-short" style="display: inline;"> With the introduction of regulations related to the ``right to be forgotten&#34;, federated learning (FL) is facing new privacy compliance challenges. To address these challenges, researchers have proposed federated unlearning (FU). However, existing FU research has primarily focused on improving the efficiency of unlearning, with less attention paid to the potential privacy vulnerabilities inherent i&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.14558v2-abstract-full').style.display = 'inline'; document.getElementById('2502.14558v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.14558v2-abstract-full" style="display: none;"> With the introduction of regulations related to the ``right to be forgotten&#34;, federated learning (FL) is facing new privacy compliance challenges. To address these challenges, researchers have proposed federated unlearning (FU). However, existing FU research has primarily focused on improving the efficiency of unlearning, with less attention paid to the potential privacy vulnerabilities inherent in these methods. To address this gap, we draw inspiration from gradient inversion attacks in FL and propose the federated unlearning inversion attack (FUIA). The FUIA is specifically designed for the three types of FU (sample unlearning, client unlearning, and class unlearning), aiming to provide a comprehensive analysis of the privacy leakage risks associated with FU. In FUIA, the server acts as an honest-but-curious attacker, recording and exploiting the model differences before and after unlearning to expose the features and labels of forgotten data. FUIA significantly leaks the privacy of forgotten data and can target all types of FU. 
This attack contradicts the goal of FU to eliminate specific data influence, instead exploiting its vulnerabilities to recover forgotten data and expose its privacy flaws. Extensive experimental results show that FUIA can effectively reveal the private information of forgotten data. To mitigate this privacy leakage, we also explore two potential defense methods, although these come at the cost of reduced unlearning effectiveness and the usability of the unlearned model. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.14558v2-abstract-full').style.display = 'none'; document.getElementById('2502.14558v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 20 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.14445">arXiv:2502.14445</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.14445">pdf</a>, <a href="https://arxiv.org/format/2502.14445">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> PredictaBoard: Benchmarking LLM Score Predictability </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Pacchiardi%2C+L">Lorenzo Pacchiardi</a>, <a href="/search/cs?searchtype=author&amp;query=Voudouris%2C+K">Konstantinos Voudouris</a>, <a href="/search/cs?searchtype=author&amp;query=Slater%2C+B">Ben Slater</a>, <a href="/search/cs?searchtype=author&amp;query=Mart%C3%ADnez-Plumed%2C+F">Fernando Mart铆nez-Plumed</a>, <a href="/search/cs?searchtype=author&amp;query=Hern%C3%A1ndez-Orallo%2C+J">Jos茅 Hern谩ndez-Orallo</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+L">Lexin Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Schellaert%2C+W">Wout Schellaert</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.14445v1-abstract-short" style="display: inline;"> Despite possessing impressive skills, Large Language Models (LLMs) often fail unpredictably, demonstrating inconsistent success in even basic common sense reasoning tasks. This unpredictability poses a significant challenge to ensuring their safe deployment, as identifying and operating within a reliable &#34;safe zone&#34; is essential for mitigating risks. 
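Note: for background, the gradient-inversion recipe this attack builds on optimizes dummy data so that the update it induces matches an observed parameter difference. The sketch below shows that generic idea, not the paper's FUIA procedure, which is tailored to each unlearning type.

```python
import torch
import torch.nn.functional as F

def invert_parameter_difference(model, delta, x_shape, n_classes, steps=300):
    """Generic gradient-inversion sketch: find dummy data whose gradient on
    `model` matches `delta`, the before/after-unlearning parameter difference
    observed by an honest-but-curious server."""
    dummy_x = torch.randn(x_shape, requires_grad=True)
    dummy_y = torch.randn(x_shape[0], n_classes, requires_grad=True)
    opt = torch.optim.Adam([dummy_x, dummy_y], lr=0.1)
    for _ in range(steps):
        opt.zero_grad()
        task_loss = F.cross_entropy(model(dummy_x), dummy_y.softmax(dim=1))
        grads = torch.autograd.grad(task_loss, model.parameters(),
                                    create_graph=True)
        match = sum(((g - d) ** 2).sum() for g, d in zip(grads, delta))
        match.backward()   # gradients flow to the dummy inputs and labels
        opt.step()
    return dummy_x.detach(), dummy_y.softmax(dim=1).detach()
```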
arXiv:2502.14445 (https://arxiv.org/abs/2502.14445)
Subjects: Computation and Language (cs.CL); Artificial Intelligence (cs.AI); Machine Learning (stat.ML)
Title: PredictaBoard: Benchmarking LLM Score Predictability
Authors: Lorenzo Pacchiardi, Konstantinos Voudouris, Ben Slater, Fernando Martínez-Plumed, José Hernández-Orallo, Lexin Zhou, Wout Schellaert
Abstract: Despite possessing impressive skills, Large Language Models (LLMs) often fail unpredictably, demonstrating inconsistent success on even basic common sense reasoning tasks. This unpredictability poses a significant challenge to ensuring their safe deployment, as identifying and operating within a reliable "safe zone" is essential for mitigating risks. To address this, we present PredictaBoard, a novel collaborative benchmarking framework designed to evaluate the ability of score predictors (referred to as assessors) to anticipate LLM errors on specific task instances (i.e., prompts) from existing datasets. PredictaBoard evaluates pairs of LLMs and assessors by considering the rejection rate at different tolerated error levels. As such, PredictaBoard stimulates research into developing better assessors and into making LLMs more predictable, not only with a higher average performance. We conduct illustrative experiments using baseline assessors and state-of-the-art LLMs. PredictaBoard highlights the critical need to evaluate predictability alongside performance, paving the way for safer AI systems where errors are not only minimised but also anticipated and effectively mitigated. Code for our benchmark can be found at https://github.com/Kinds-of-Intelligence-CFI/PredictaBoard
Submitted 20 February, 2025; originally announced February 2025.
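Note: the evaluation idea (rejection rate against tolerated error) can be illustrated in a few lines of NumPy. This is a hypothetical rendering of such a metric, not the benchmark's own code; see the linked repository for that.

```python
import numpy as np

def accuracy_after_rejection(correct, assessor_scores, reject_fraction):
    """Accuracy of an LLM on the prompts an assessor keeps, after rejecting
    the fraction it scores as most likely to fail.
    correct: 0/1 array per prompt; assessor_scores: predicted P(success)."""
    order = np.argsort(assessor_scores)               # most failure-prone first
    keep = order[int(len(order) * reject_fraction):]
    return correct[keep].mean()

# Sweeping reject_fraction traces an accuracy-rejection curve: a good assessor
# reaches a tolerated error level while rejecting as few prompts as possible.
```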
arXiv:2502.11946 (https://arxiv.org/abs/2502.11946)
Subjects: Computation and Language (cs.CL); Artificial Intelligence (cs.AI); Human-Computer Interaction (cs.HC); Sound (cs.SD); Audio and Speech Processing (eess.AS)
Title: Step-Audio: Unified Understanding and Generation in Intelligent Speech Interaction
Authors: Ailin Huang, Boyong Wu, Bruce Wang, Chao Yan, Chen Hu, Chengli Feng, Fei Tian, Feiyu Shen, Jingbei Li, Mingrui Chen, Peng Liu, Ruihang Miao, Wang You, Xi Chen, Xuerui Yang, Yechang Huang, Yuxiang Zhang, Zheng Gong, Zixin Zhang, Hongyu Zhou, Jianjian Sun, Brian Li, Chengting Feng, Changyi Wan, Hanpeng Hu, et al. (120 additional authors not shown)
Abstract: Real-time speech interaction, serving as a fundamental interface for human-machine collaboration, holds immense potential. However, current open-source models face limitations such as the high cost of voice data collection, weakness in dynamic control, and limited intelligence. To address these challenges, this paper introduces Step-Audio, the first production-ready open-source solution. Key contributions include: 1) a 130B-parameter unified speech-text multi-modal model that achieves unified understanding and generation, with the Step-Audio-Chat version open-sourced; 2) a generative speech data engine that establishes an affordable voice cloning framework and produces the open-sourced lightweight Step-Audio-TTS-3B model through distillation; 3) an instruction-driven fine control system enabling dynamic adjustments across dialects, emotions, singing, and RAP; 4) an enhanced cognitive architecture augmented with tool calling and role-playing abilities to manage complex tasks effectively. Based on our new StepEval-Audio-360 evaluation benchmark, Step-Audio achieves state-of-the-art performance in human evaluations, especially in terms of instruction following. On open-source benchmarks such as LLaMA Question, it shows a 9.3% average performance improvement, demonstrating our commitment to advancing the development of open-source multi-modal language technologies. Our code and models are available at https://github.com/stepfun-ai/Step-Audio.
Submitted 18 February, 2025; v1 submitted 17 February, 2025; originally announced February 2025.
arXiv:2502.11193 (https://arxiv.org/abs/2502.11193)
Subjects: Computation and Language (cs.CL)
Title: Large Language Models Penetration in Scholarly Writing and Peer Review
Authors: Li Zhou, Ruijie Zhang, Xunlian Dai, Daniel Hershcovich, Haizhou Li
Abstract: While the widespread use of Large Language Models (LLMs) brings convenience, it also raises concerns about the credibility of academic research and scholarly processes. To better understand these dynamics, we evaluate the penetration of LLMs across academic workflows from multiple perspectives and dimensions, providing compelling evidence of their growing influence. We propose a framework with two components: ScholarLens, a curated dataset of human- and LLM-generated content across scholarly writing and peer review for multi-perspective evaluation, and LLMetrica, a tool for assessing LLM penetration using rule-based metrics and model-based detectors for multi-dimensional evaluation. Our experiments demonstrate the effectiveness of LLMetrica, revealing the increasing role of LLMs in scholarly processes. These findings emphasize the need for transparency, accountability, and ethical practices in LLM usage to maintain academic credibility.
Submitted 16 February, 2025; originally announced February 2025.
Comments: Transparency in NLP, LLM-generated text evaluation and detection, LLM Penetration, Scholarly Credibility and Accountability
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Transparency in NLP, LLM-generated text evaluation and detection, LLM Penetration, Scholarly Credibility and Accountability</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.06816">arXiv:2502.06816</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.06816">pdf</a>, <a href="https://arxiv.org/format/2502.06816">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> DeepCell: Multiview Representation Learning for Post-Mapping Netlists </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Shi%2C+Z">Zhengyuan Shi</a>, <a href="/search/cs?searchtype=author&amp;query=Ma%2C+C">Chengyu Ma</a>, <a href="/search/cs?searchtype=author&amp;query=Zheng%2C+Z">Ziyang Zheng</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+L">Lingfeng Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Pan%2C+H">Hongyang Pan</a>, <a href="/search/cs?searchtype=author&amp;query=Jiang%2C+W">Wentao Jiang</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+F">Fan Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+X">Xiaoyan Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Chu%2C+Z">Zhufei Chu</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+Q">Qiang Xu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.06816v1-abstract-short" style="display: inline;"> Representation learning for post-mapping (PM) netlists is a critical challenge in Electronic Design Automation (EDA), driven by the diverse and complex nature of modern circuit designs. Existing approaches focus on intermediate representations like And-Inverter Graphs (AIGs), limiting their applicability to post-synthesis stages. We introduce DeepCell, a multiview representation learning framework&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.06816v1-abstract-full').style.display = 'inline'; document.getElementById('2502.06816v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.06816v1-abstract-full" style="display: none;"> Representation learning for post-mapping (PM) netlists is a critical challenge in Electronic Design Automation (EDA), driven by the diverse and complex nature of modern circuit designs. Existing approaches focus on intermediate representations like And-Inverter Graphs (AIGs), limiting their applicability to post-synthesis stages. We introduce DeepCell, a multiview representation learning framework that integrates structural and functional insights from both PM netlists and AIGs to learn rich, generalizable embeddings. At its core, DeepCell employs the novel Mask Circuit Modeling (MCM) mechanism, which refines PM netlist representations in a self-supervised manner using pretrained AIG encoders. 
arXiv:2502.04711 (https://arxiv.org/abs/2502.04711)
Subjects: Sound (cs.SD); Audio and Speech Processing (eess.AS)
Title: Dynamic Frequency-Adaptive Knowledge Distillation for Speech Enhancement
Authors: Xihao Yuan, Siqi Liu, Hanting Chen, Lu Zhou, Jian Li, Jie Hu
Abstract: Deep learning-based speech enhancement (SE) models have recently outperformed traditional techniques, yet their deployment on resource-constrained devices remains challenging due to high computational and memory demands. This paper introduces a novel dynamic frequency-adaptive knowledge distillation (DFKD) approach to effectively compress SE models. Our method dynamically assesses the model's output, distinguishing between high- and low-frequency components, and adapts the learning objectives to meet the unique requirements of different frequency bands, capitalizing on the SE task's inherent characteristics. To evaluate DFKD's efficacy, we conducted experiments on three state-of-the-art models: DCCRN, Conv-TasNet, and DPTNet. The results demonstrate that our method not only significantly enhances the performance of the compressed (student) model but also surpasses other logit-based knowledge distillation methods specifically for SE tasks.
Submitted 7 February, 2025; originally announced February 2025.
Comments: 5 pages, 2 figures, accepted by ICASSP 2025
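Note: the core idea of weighting distillation differently per frequency band can be sketched with an STFT split. The fixed cutoff and static weights below are placeholders for what DFKD adapts dynamically; this is not the paper's loss.

```python
import torch
import torch.nn.functional as F

def band_split_kd_loss(student_wav, teacher_wav, n_fft=512,
                       cutoff_bin=64, w_low=1.0, w_high=1.0):
    """Distillation loss computed separately on low and high frequency bands
    of the enhanced speech. Fixed cutoff/weights are assumptions."""
    win = torch.hann_window(n_fft)
    S = torch.stft(student_wav, n_fft, window=win, return_complex=True).abs()
    T = torch.stft(teacher_wav, n_fft, window=win, return_complex=True).abs()
    low = F.l1_loss(S[..., :cutoff_bin, :], T[..., :cutoff_bin, :])
    high = F.l1_loss(S[..., cutoff_bin:, :], T[..., cutoff_bin:, :])
    return w_low * low + w_high * high
```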
arXiv:2502.03814 (https://arxiv.org/abs/2502.03814)
Subjects: Robotics (cs.RO); Artificial Intelligence (cs.AI)
Title: Large Language Models for Multi-Robot Systems: A Survey
Authors: Peihan Li, Zijian An, Shams Abrar, Lifeng Zhou
Abstract: The rapid advancement of Large Language Models (LLMs) has opened new possibilities in Multi-Robot Systems (MRS), enabling enhanced communication, task planning, and human-robot interaction. Unlike traditional single-robot and multi-agent systems, MRS poses unique challenges, including coordination, scalability, and real-world adaptability. This survey provides the first comprehensive exploration of LLM integration into MRS. It systematically categorizes their applications across high-level task allocation, mid-level motion planning, low-level action generation, and human intervention. We highlight key applications in diverse domains, such as household robotics, construction, formation control, target tracking, and robot games, showcasing the versatility and transformative potential of LLMs in MRS. Furthermore, we examine the challenges that limit the adoption of LLMs in MRS, including mathematical reasoning limitations, hallucination, latency issues, and the need for robust benchmarking systems. Finally, we outline opportunities for future research, emphasizing advancements in fine-tuning, reasoning techniques, and task-specific models. This survey aims to guide researchers toward the intelligent, real-world deployment of MRS powered by LLMs. Given the fast-evolving nature of research in this field, we keep an updated collection of papers in an open-source GitHub repository.
Submitted 12 February, 2025; v1 submitted 6 February, 2025; originally announced February 2025.
Latent diffusion models are typically trained using Variational Autoencoders (VAEs), interacting with VAE latents rather than the real samples.&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.00359v1-abstract-full').style.display = 'inline'; document.getElementById('2502.00359v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.00359v1-abstract-full" style="display: none;"> Generative models serve as powerful tools for modeling the real world, with mainstream diffusion models, particularly those based on the latent diffusion model paradigm, achieving remarkable progress across various tasks, such as image and video synthesis. Latent diffusion models are typically trained using Variational Autoencoders (VAEs), interacting with VAE latents rather than the real samples. While this generative paradigm speeds up training and inference, the quality of the generated outputs is limited by the latents&#39; quality. Traditional VAE latents are often seen as spatial compression in pixel space and lack explicit semantic representations, which are essential for modeling the real world. In this paper, we introduce ReaLS (Representation-Aligned Latent Space), which integrates semantic priors to improve generation performance. Extensive experiments show that fundamental DiT and SiT trained on ReaLS can achieve a 15% improvement in the FID metric. Furthermore, the enhanced semantic latent space enables more perceptual downstream tasks, such as segmentation and depth estimation. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.00359v1-abstract-full').style.display = 'none'; document.getElementById('2502.00359v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025.
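<p>As a rough illustration of the alignment idea this abstract describes, the sketch below adds a representation-alignment term to a latent-space training objective. All names, the spatial pooling, and the cosine-similarity choice are illustrative assumptions, not details taken from the paper:</p> <pre><code>
import torch.nn.functional as F

def alignment_loss(vae_latents, semantic_feats, proj_head):
    """Hypothetical sketch: align pooled VAE latents with semantic embeddings.

    vae_latents:    (B, C, H, W) latents from the VAE
    semantic_feats: (B, D) embeddings from a pretrained semantic encoder
                    (assumption: any strong self-supervised vision encoder)
    proj_head:      small learnable layer mapping C dims to D dims
    """
    pooled = vae_latents.mean(dim=(2, 3))      # (B, C) spatial average
    pred = proj_head(pooled)                   # (B, D) projected latents
    # Minimize 1 - cosine similarity so the latents carry semantic structure.
    return 1.0 - F.cosine_similarity(pred, semantic_feats, dim=-1).mean()
</code></pre> <p>In such a setup this term would be added, with some weight, to the usual reconstruction or diffusion loss; the paper's actual objective may differ.</p>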
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.15828">arXiv:2501.15828</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2501.15828">pdf</a>, <a href="https://arxiv.org/format/2501.15828">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computational Finance">q-fin.CP</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Quantum Physics">quant-ph</span> </div> </div> <p class="title is-5 mathjax"> Hybrid Quantum Neural Networks with Amplitude Encoding: Advancing Recovery Rate Predictions </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Chen%2C+Y">Ying Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Griffin%2C+P">Paul Griffin</a>, <a href="/search/cs?searchtype=author&amp;query=Recchia%2C+P">Paolo Recchia</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+L">Lei Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+H">Hongrui Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.15828v4-abstract-short" style="display: inline;"> Recovery rate prediction plays a pivotal role in bond investment strategies, enhancing risk assessment, optimizing portfolio allocation, improving pricing accuracy, and supporting effective credit risk management. However, forecasting faces challenges like high-dimensional features, small sample sizes, and overfitting. We propose a hybrid Quantum Machine Learning model incorporating Parameterized&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.15828v4-abstract-full').style.display = 'inline'; document.getElementById('2501.15828v4-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.15828v4-abstract-full" style="display: none;"> Recovery rate prediction plays a pivotal role in bond investment strategies, enhancing risk assessment, optimizing portfolio allocation, improving pricing accuracy, and supporting effective credit risk management. However, forecasting faces challenges like high-dimensional features, small sample sizes, and overfitting. We propose a hybrid Quantum Machine Learning model incorporating Parameterized Quantum Circuits (PQC) within a neural network framework. PQCs inherently preserve unitarity, avoiding computationally costly orthogonality constraints, while amplitude encoding enables exponential data compression, reducing qubit requirements logarithmically. Applied to a global dataset of 1,725 observations (1996-2023), our method achieved superior accuracy (RMSE 0.228) compared to classical neural networks (0.246) and quantum models with angle encoding (0.242), with efficient computation times. This work highlights the potential of hybrid quantum-classical architectures in advancing recovery rate forecasting. 
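<p>For intuition on the qubit savings cited above: amplitude encoding stores a d-dimensional feature vector in the amplitudes of ceil(log2 d) qubits, since n qubits hold 2^n amplitudes. A generic numpy sketch, not the paper's code:</p> <pre><code>
import numpy as np

def amplitude_encode(features):
    """Embed a feature vector into a normalized quantum state vector.

    A d-dimensional input fits into ceil(log2(d)) qubits: the logarithmic
    reduction in qubit requirements the abstract mentions.
    """
    n_qubits = int(np.ceil(np.log2(len(features))))
    state = np.zeros(2 ** n_qubits)
    state[: len(features)] = features            # pad unused amplitudes with 0
    return state / np.linalg.norm(state), n_qubits

state, n = amplitude_encode(np.random.rand(12))  # 12 features -> 4 qubits
</code></pre>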
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.15828v4-abstract-full').style.display = 'none'; document.getElementById('2501.15828v4-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 27 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.15238">arXiv:2501.15238</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2501.15238">pdf</a>, <a href="https://arxiv.org/ps/2501.15238">ps</a>, <a href="https://arxiv.org/format/2501.15238">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Logic in Computer Science">cs.LO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Quantum Physics">quant-ph</span> </div> </div> <p class="title is-5 mathjax"> Complete Quantum Relational Hoare Logics from Optimal Transport Duality </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Barthe%2C+G">Gilles Barthe</a>, <a href="/search/cs?searchtype=author&amp;query=Gao%2C+M">Minbo Gao</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+T">Theo Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+L">Li Zhou</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.15238v1-abstract-short" style="display: inline;"> We introduce a quantitative relational Hoare logic for quantum programs. Assertions of the logic range over a new infinitary extension of positive semidefinite operators. We prove that our logic is sound, and complete for bounded postconditions and almost surely terminating programs. Our completeness result is based on a quantum version of the duality theorem from optimal transport. We also define&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.15238v1-abstract-full').style.display = 'inline'; document.getElementById('2501.15238v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.15238v1-abstract-full" style="display: none;"> We introduce a quantitative relational Hoare logic for quantum programs. Assertions of the logic range over a new infinitary extension of positive semidefinite operators. We prove that our logic is sound, and complete for bounded postconditions and almost surely terminating programs. Our completeness result is based on a quantum version of the duality theorem from optimal transport. We also define a complete embedding into our logic of a relational Hoare logic with projective assertions. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.15238v1-abstract-full').style.display = 'none'; document.getElementById('2501.15238v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.13859">arXiv:2501.13859</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2501.13859">pdf</a>, <a href="https://arxiv.org/format/2501.13859">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Learning Visual Proxy for Compositional Zero-Shot Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+S">Shiyu Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Yan%2C+C">Cheng Yan</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Y">Yang Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Jing%2C+C">Chenchen Jing</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+L">Lei Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+W">Wenjun Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.13859v3-abstract-short" style="display: inline;"> Compositional Zero-Shot Learning (CZSL) aims to recognize novel attribute-object compositions by leveraging knowledge from seen compositions. Existing methods align textual prototypes with visual features through Vision-Language Models (VLMs), but they face two key limitations: (1) modality gaps hinder the discrimination of semantically similar composition pairs, and (2) single-modal textual proto&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.13859v3-abstract-full').style.display = 'inline'; document.getElementById('2501.13859v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.13859v3-abstract-full" style="display: none;"> Compositional Zero-Shot Learning (CZSL) aims to recognize novel attribute-object compositions by leveraging knowledge from seen compositions. Existing methods align textual prototypes with visual features through Vision-Language Models (VLMs), but they face two key limitations: (1) modality gaps hinder the discrimination of semantically similar composition pairs, and (2) single-modal textual prototypes lack fine-grained visual cues, creating bottlenecks in VLM-based CZSL. In this paper, we introduce Visual Proxy Learning, a novel approach that facilitates the learning of distinct visual distributions, effectively reducing the modality gap and improving compositional generalization performance. Specifically, we initialize visual proxies for various attributes, objects, and their compositions using text representations. By optimizing the visual space, we capture fine-grained visual cues and guide the learning of more discriminative visual representations for attributes, objects and compositions. 
Furthermore, we propose an effective Cross-Modal Joint Learning (CMJL) strategy that imposes cross-modal constraints between the original text-image space and the fine-grained visual space. This approach not only boosts generalization for previously unseen composition pairs but also sharpens the discrimination of similar pairs, fostering more robust and precise learning. Extensive experiments demonstrate state-of-the-art performance in closed-world scenarios and competitive open-world results across four established CZSL benchmarks, validating the effectiveness of our approach in advancing compositional generalization. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.13859v3-abstract-full').style.display = 'none'; document.getElementById('2501.13859v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 23 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.12602">arXiv:2501.12602</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2501.12602">pdf</a>, <a href="https://arxiv.org/format/2501.12602">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> BLR-MoE: Boosted Language-Routing Mixture of Experts for Domain-Robust Multilingual E2E ASR </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Ma%2C+G">Guodong Ma</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+W">Wenxuan Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+L">Lifeng Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+Y">Yuting Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Y">Yuke Li</a>, <a href="/search/cs?searchtype=author&amp;query=Du%2C+B">Binbin Du</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.12602v1-abstract-short" style="display: inline;"> Recently, the Mixture of Experts (MoE) architecture, such as LR-MoE, is often used to alleviate the impact of language confusion on the multilingual ASR (MASR) task. However, it still faces language confusion issues, especially in mismatched domain scenarios. In this paper, we decouple language confusion in LR-MoE into confusion in self-attention and in the router. 
To alleviate the language confusion in s&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.12602v1-abstract-full').style.display = 'inline'; document.getElementById('2501.12602v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.12602v1-abstract-full" style="display: none;"> Recently, the Mixture of Experts (MoE) architecture, such as LR-MoE, is often used to alleviate the impact of language confusion on the multilingual ASR (MASR) task. However, it still faces language confusion issues, especially in mismatched domain scenarios. In this paper, we decouple language confusion in LR-MoE into confusion in self-attention and in the router. To alleviate the language confusion in self-attention, based on LR-MoE, we propose to apply an attention-MoE architecture for MASR. In our new architecture, MoE is utilized not only in the feed-forward network (FFN) but also in self-attention. In addition, to improve the robustness of the LID-based router against language confusion, we propose expert pruning and router augmentation methods. Combining the above, we obtain the boosted language-routing MoE (BLR-MoE) architecture. We verify the effectiveness of the proposed BLR-MoE on a 10,000-hour MASR dataset. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.12602v1-abstract-full').style.display = 'none'; document.getElementById('2501.12602v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to IEEE ICASSP 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.07397">arXiv:2501.07397</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2501.07397">pdf</a>, <a href="https://arxiv.org/format/2501.07397">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> OmniEraser: Remove Objects and Their Effects in Images with Paired Video-Frame Data </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wei%2C+R">Runpu Wei</a>, <a href="/search/cs?searchtype=author&amp;query=Yin%2C+Z">Zijin Yin</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+S">Shuo Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+L">Lanxiang Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+X">Xueyi Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Ban%2C+C">Chao Ban</a>, <a href="/search/cs?searchtype=author&amp;query=Cao%2C+T">Tianwei Cao</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+H">Hao Sun</a>, <a href="/search/cs?searchtype=author&amp;query=He%2C+Z">Zhongjiang He</a>, <a href="/search/cs?searchtype=author&amp;query=Liang%2C+K">Kongming Liang</a>, <a href="/search/cs?searchtype=author&amp;query=Ma%2C+Z">Zhanyu Ma</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis 
has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.07397v3-abstract-short" style="display: inline;"> Inpainting algorithms have achieved remarkable progress in removing objects from images, yet still face two challenges: 1) struggle to handle the object&#39;s visual effects such as shadow and reflection; 2) easily generate shape-like artifacts and unintended content. In this paper, we propose Video4Removal, a large-scale dataset comprising over 100,000 high-quality samples with realistic object shado&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.07397v3-abstract-full').style.display = 'inline'; document.getElementById('2501.07397v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.07397v3-abstract-full" style="display: none;"> Inpainting algorithms have achieved remarkable progress in removing objects from images, yet still face two challenges: 1) struggle to handle the object&#39;s visual effects such as shadow and reflection; 2) easily generate shape-like artifacts and unintended content. In this paper, we propose Video4Removal, a large-scale dataset comprising over 100,000 high-quality samples with realistic object shadows and reflections. By constructing object-background pairs from video frames with off-the-shelf vision models, the labor costs of data acquisition can be significantly reduced. To avoid generating shape-like artifacts and unintended content, we propose Object-Background Guidance, an elaborated paradigm that takes both the foreground object and background images. It can guide the diffusion process to harness richer contextual information. Based on the above two designs, we present OmniEraser, a novel method that seamlessly removes objects and their visual effects using only object masks as input. Extensive experiments show that OmniEraser significantly outperforms previous methods, particularly in complex in-the-wild scenes. And it also exhibits a strong generalization ability in anime-style images. Datasets, models, and codes will be published. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.07397v3-abstract-full').style.display = 'none'; document.getElementById('2501.07397v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 13 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.06678">arXiv:2501.06678</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2501.06678">pdf</a>, <a href="https://arxiv.org/format/2501.06678">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Imbalanced Medical Image Segmentation with Pixel-dependent Noisy Labels </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Guo%2C+E">Erjian Guo</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Z">Zicheng Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+Z">Zhen Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+L">Luping Zhou</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.06678v1-abstract-short" style="display: inline;"> Accurate medical image segmentation is often hindered by noisy labels in training data, due to the challenges of annotating medical images. Prior work addressing noisy labels tends to make class-dependent assumptions, overlooking the pixel-dependent nature of most noisy labels. Furthermore, existing methods typically apply fixed thresholds to filter out noisy labels, risking the removal o&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.06678v1-abstract-full').style.display = 'inline'; document.getElementById('2501.06678v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.06678v1-abstract-full" style="display: none;"> Accurate medical image segmentation is often hindered by noisy labels in training data, due to the challenges of annotating medical images. Prior work addressing noisy labels tends to make class-dependent assumptions, overlooking the pixel-dependent nature of most noisy labels. Furthermore, existing methods typically apply fixed thresholds to filter out noisy labels, risking the removal of minority classes and consequently degrading segmentation performance. To bridge these gaps, our proposed framework, Collaborative Learning with Curriculum Selection (CLCS), addresses pixel-dependent noisy labels with class imbalance. CLCS advances existing work by i) treating noisy labels as pixel-dependent and addressing them through a collaborative learning framework, ii) employing a curriculum dynamic thresholding approach that adapts to model learning progress to select clean data samples and mitigate the class imbalance issue, and iii) applying a noise balance loss to noisy data samples to improve data utilization instead of discarding them outright. Specifically, our CLCS contains two modules: Curriculum Noisy Label Sample Selection (CNS) and Noise Balance Loss (NBL). In the CNS module, we design a two-branch network with a discrepancy loss for collaborative learning so that different feature representations of the same instance can be extracted from distinct views and used to vote on the class probabilities of pixels. 
In addition, a curriculum dynamic threshold is adopted to select clean-label samples through probability voting. In the NBL module, instead of directly dropping suspiciously noisy labels, we adopt a robust loss that leverages such instances to boost performance. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.06678v1-abstract-full').style.display = 'none'; document.getElementById('2501.06678v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.06655">arXiv:2501.06655</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2501.06655">pdf</a>, <a href="https://arxiv.org/format/2501.06655">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Personalized Preference Fine-tuning of Diffusion Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Dang%2C+M">Meihua Dang</a>, <a href="/search/cs?searchtype=author&amp;query=Singh%2C+A">Anikait Singh</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+L">Linqi Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Ermon%2C+S">Stefano Ermon</a>, <a href="/search/cs?searchtype=author&amp;query=Song%2C+J">Jiaming Song</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.06655v1-abstract-short" style="display: inline;"> RLHF techniques like DPO can significantly improve the generation quality of text-to-image diffusion models. However, these methods optimize for a single reward that aligns model generation with population-level preferences, neglecting the nuances of individual users&#39; beliefs or values. This lack of personalization limits the efficacy of these models. To bridge this gap, we introduce PPD, a multi-&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.06655v1-abstract-full').style.display = 'inline'; document.getElementById('2501.06655v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.06655v1-abstract-full" style="display: none;"> RLHF techniques like DPO can significantly improve the generation quality of text-to-image diffusion models. However, these methods optimize for a single reward that aligns model generation with population-level preferences, neglecting the nuances of individual users&#39; beliefs or values. This lack of personalization limits the efficacy of these models. To bridge this gap, we introduce PPD, a multi-reward optimization objective that aligns diffusion models with personalized preferences. With PPD, a diffusion model learns the individual preferences of a population of users in a few-shot way, enabling generalization to unseen users. 
Specifically, our approach (1) leverages a vision-language model (VLM) to extract personal preference embeddings from a small set of pairwise preference examples, and then (2) incorporates the embeddings into diffusion models through cross-attention. Conditioning on user embeddings, the text-to-image models are fine-tuned with the DPO objective, simultaneously optimizing for alignment with the preferences of multiple users. Empirical results demonstrate that our method effectively optimizes for multiple reward functions and can interpolate between them during inference. In real-world user scenarios, with as few as four preference examples from a new user, our approach achieves an average win rate of 76% over Stable Cascade, generating images that more accurately reflect specific user preferences. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.06655v1-abstract-full').style.display = 'none'; document.getElementById('2501.06655v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.00353">arXiv:2501.00353</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2501.00353">pdf</a>, <a href="https://arxiv.org/format/2501.00353">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> RAG-Instruct: Boosting LLMs with Diverse Retrieval-Augmented Instructions </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Liu%2C+W">Wanlong Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+J">Junying Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Ji%2C+K">Ke Ji</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+L">Li Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+W">Wenyu Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+B">Benyou Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.00353v1-abstract-short" style="display: inline;"> Retrieval-Augmented Generation (RAG) has emerged as a key paradigm for enhancing large language models (LLMs) by incorporating external knowledge. However, current RAG methods face two limitations: (1) they cover only limited RAG scenarios, and (2) they suffer from limited task diversity due to the lack of a general RAG dataset. 
To address these limitations, we propose RAG-Instruct, a general method f&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.00353v1-abstract-full').style.display = 'inline'; document.getElementById('2501.00353v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.00353v1-abstract-full" style="display: none;"> Retrieval-Augmented Generation (RAG) has emerged as a key paradigm for enhancing large language models (LLMs) by incorporating external knowledge. However, current RAG methods face two limitations: (1) they cover only limited RAG scenarios, and (2) they suffer from limited task diversity due to the lack of a general RAG dataset. To address these limitations, we propose RAG-Instruct, a general method for synthesizing diverse and high-quality RAG instruction data based on any source corpus. Our approach leverages (1) five RAG paradigms, which encompass diverse query-document relationships, and (2) instruction simulation, which enhances instruction diversity and quality by utilizing the strengths of existing instruction datasets. Using this method, we construct a 40K instruction dataset from Wikipedia, comprehensively covering diverse RAG scenarios and tasks. Experiments demonstrate that RAG-Instruct effectively enhances LLMs&#39; RAG capabilities, achieving strong zero-shot performance and significantly outperforming various RAG baselines across a diverse set of tasks. RAG-Instruct is publicly available at https://github.com/FreedomIntelligence/RAG-Instruct. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.00353v1-abstract-full').style.display = 'none'; document.getElementById('2501.00353v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.20727">arXiv:2412.20727</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.20727">pdf</a>, <a href="https://arxiv.org/format/2412.20727">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> AverageLinear: Enhance Long-Term Time series forecasting with simple averaging </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+G">Gaoxiang Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+L">Li Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+X">Xiaoqiang Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.20727v2-abstract-short" style="display: inline;"> Long-term time series analysis aims to forecast long-term trends by examining changes over past and future periods. The intricacy of time series data poses significant challenges for modeling. 
Models based on the Transformer architecture, through the application of attention mechanisms to channels and sequences, have demonstrated notable performance advantages. In contrast, methods based on convol&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.20727v2-abstract-full').style.display = 'inline'; document.getElementById('2412.20727v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.20727v2-abstract-full" style="display: none;"> Long-term time series analysis aims to forecast long-term trends by examining changes over past and future periods. The intricacy of time series data poses significant challenges for modeling. Models based on the Transformer architecture, through the application of attention mechanisms to channels and sequences, have demonstrated notable performance advantages. In contrast, methods based on convolutional neural networks or linear models often struggle to effectively handle scenarios with a large number of channels. However, our research reveals that the attention mechanism is not the core component responsible for performance enhancement. We have designed an exceedingly simple linear structure, AverageLinear. By employing straightforward channel embedding and averaging operations, this model can effectively capture correlations between channels while maintaining a lightweight architecture. Experiments on real-world datasets show that AverageLinear matches or even surpasses state-of-the-art Transformer-based structures in performance. This indicates that using purely linear structures can also endow models with robust predictive power. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.20727v2-abstract-full').style.display = 'none'; document.getElementById('2412.20727v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 30 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. 
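<p>A minimal sketch of the kind of architecture this abstract describes; the layer shapes and where the averaging enters are illustrative assumptions, not the authors' published design:</p> <pre><code>
import torch
import torch.nn as nn

class AverageLinearSketch(nn.Module):
    """Per-channel linear forecaster plus a channel-averaged branch."""

    def __init__(self, lookback, horizon):
        super().__init__()
        self.per_channel = nn.Linear(lookback, horizon)  # per-channel temporal projection
        self.shared = nn.Linear(lookback, horizon)       # branch fed by the channel average

    def forward(self, x):                    # x: (batch, channels, lookback)
        mean_series = x.mean(dim=1, keepdim=True)  # average across channels
        # Broadcasting adds the cross-channel signal back to every channel.
        return self.per_channel(x) + self.shared(mean_series)

model = AverageLinearSketch(lookback=96, horizon=24)
out = model(torch.randn(8, 7, 96))           # -> (8, 7, 24)
</code></pre>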
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.19487">arXiv:2412.19487</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.19487">pdf</a>, <a href="https://arxiv.org/format/2412.19487">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> UniBrain: A Unified Model for Cross-Subject Brain Decoding </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Z">Zicheng Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+Z">Zhen Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+L">Luping Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Nachev%2C+P">Parashkev Nachev</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.19487v1-abstract-short" style="display: inline;"> Brain decoding aims to reconstruct original stimuli from fMRI signals, providing insights into interpreting mental content. Current approaches rely heavily on subject-specific models due to the complex brain processing mechanisms and the variations in fMRI signals across individuals. Therefore, these methods greatly limit the generalization of models and fail to capture cross-subject commonalities&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.19487v1-abstract-full').style.display = 'inline'; document.getElementById('2412.19487v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.19487v1-abstract-full" style="display: none;"> Brain decoding aims to reconstruct original stimuli from fMRI signals, providing insights into interpreting mental content. Current approaches rely heavily on subject-specific models due to the complex brain processing mechanisms and the variations in fMRI signals across individuals. Therefore, these methods greatly limit the generalization of models and fail to capture cross-subject commonalities. To address this, we present UniBrain, a unified brain decoding model that requires no subject-specific parameters. Our approach includes a group-based extractor to handle variable fMRI signal lengths, a mutual assistance embedder to capture cross-subject commonalities, and a bilevel feature alignment scheme for extracting subject-invariant features. We validate UniBrain on the brain decoding benchmark, achieving comparable performance to current state-of-the-art subject-specific models with far fewer parameters. We also propose a generalization benchmark to encourage the community to emphasize cross-subject commonalities for more general brain decoding. Our code is available at https://github.com/xiaoyao3302/UniBrain. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.19487v1-abstract-full').style.display = 'none'; document.getElementById('2412.19487v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. 
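<p>One generic way to handle variable-length fMRI inputs, as the group-based extractor above must: pool each signal to a fixed length before shared processing. The pooling choice and sizes below are illustrative assumptions, not the paper's actual design:</p> <pre><code>
import torch
import torch.nn as nn

class VariableLengthPooler(nn.Module):
    """Map fMRI signals of any length to a fixed-size representation."""

    def __init__(self, target_len=256):
        super().__init__()
        # Adaptive pooling yields target_len outputs for any input length.
        self.pool = nn.AdaptiveAvgPool1d(target_len)

    def forward(self, x):            # x: (batch, 1, n_voxels), n_voxels varies
        return self.pool(x)          # -> (batch, 1, target_len)

pooler = VariableLengthPooler()
a = pooler(torch.randn(4, 1, 13000))   # hypothetical subject A signal length
b = pooler(torch.randn(4, 1, 15724))   # hypothetical subject B signal length
assert a.shape == b.shape              # both (4, 1, 256)
</code></pre>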
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">13 pages, 4 figures, 9 tables</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.19463">arXiv:2412.19463</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.19463">pdf</a>, <a href="https://arxiv.org/ps/2412.19463">ps</a>, <a href="https://arxiv.org/format/2412.19463">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Programming Languages">cs.PL</span> </div> </div> <p class="title is-5 mathjax"> Laws of Quantum Programming </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Ying%2C+M">Mingsheng Ying</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+L">Li Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Barthe%2C+G">Gilles Barthe</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.19463v1-abstract-short" style="display: inline;"> In this paper, we investigate the fundamental laws of quantum programming. We extend a comprehensive set of Hoare et al.&#39;s basic laws of classical programming to the quantum setting. These laws characterise the algebraic properties of quantum programs, such as the distributivity of sequential composition over (quantum) if-statements and the unfolding of nested (quantum) if-statements. At the same&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.19463v1-abstract-full').style.display = 'inline'; document.getElementById('2412.19463v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.19463v1-abstract-full" style="display: none;"> In this paper, we investigate the fundamental laws of quantum programming. We extend a comprehensive set of Hoare et al.&#39;s basic laws of classical programming to the quantum setting. These laws characterise the algebraic properties of quantum programs, such as the distributivity of sequential composition over (quantum) if-statements and the unfolding of nested (quantum) if-statements. At the same time, we clarify some subtle differences between certain laws of classical programming and their quantum counterparts. Additionally, we derive a fixpoint characterization of quantum while-loops and a loop-based realisation of tail recursion in quantum programming. Furthermore, we establish two normal form theorems: one for quantum circuits and one for finite quantum programs. The theory in which these laws are established is formalised in the Coq proof assistant, and all of these laws are mechanically verified. As an application case of our laws, we present a formal derivation of the principle of deferred measurements in dynamic quantum circuits. We expect that these laws can be utilized in correctness-preserving transformation, compilation, and automatic code optimization in quantum programming. In particular, because these laws are formally verified in Coq, they can be confidently applied in quantum program development. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.19463v1-abstract-full').style.display = 'none'; document.getElementById('2412.19463v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.18844">arXiv:2412.18844</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.18844">pdf</a>, <a href="https://arxiv.org/format/2412.18844">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Improving Integrated Gradient-based Transferable Adversarial Examples by Refining the Integration Path </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Ren%2C+Y">Yuchen Ren</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+Z">Zhengyu Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Lin%2C+C">Chenhao Lin</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+B">Bo Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+L">Lu Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Z">Zhe Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Shen%2C+C">Chao Shen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.18844v1-abstract-short" style="display: inline;"> Transferable adversarial examples are known to cause threats in practical, black-box attack scenarios. A notable approach to improving transferability is using integrated gradients (IG), originally developed for model interpretability. In this paper, we find that existing IG-based attacks have limited transferability due to their naive adoption of IG in model interpretability. To address this limi&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.18844v1-abstract-full').style.display = 'inline'; document.getElementById('2412.18844v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.18844v1-abstract-full" style="display: none;"> Transferable adversarial examples are known to cause threats in practical, black-box attack scenarios. A notable approach to improving transferability is using integrated gradients (IG), originally developed for model interpretability. In this paper, we find that existing IG-based attacks have limited transferability due to their naive adoption of IG in model interpretability. To address this limitation, we focus on the IG integration path and refine it in three aspects: multiplicity, monotonicity, and diversity, supported by theoretical analyses. 
We propose the Multiple Monotonic Diversified Integrated Gradients (MuMoDIG) attack, which can generate highly transferable adversarial examples on different CNN and ViT models and defenses. Experiments validate that MuMoDIG outperforms the latest IG-based attack by up to 37.3% and other state-of-the-art attacks by 8.4%. In general, our study reveals that migrating established techniques to improve transferability may require non-trivial efforts. Code is available at https://github.com/RYC-98/MuMoDIG. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.18844v1-abstract-full').style.display = 'none'; document.getElementById('2412.18844v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by AAAI 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.16809">arXiv:2412.16809</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.16809">pdf</a>, <a href="https://arxiv.org/format/2412.16809">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> GeoTexDensifier: Geometry-Texture-Aware Densification for High-Quality Photorealistic 3D Gaussian Splatting </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Jiang%2C+H">Hanqing Jiang</a>, <a href="/search/cs?searchtype=author&amp;query=Xiang%2C+X">Xiaojun Xiang</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+H">Han Sun</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+H">Hongjie Li</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+L">Liyang Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+X">Xiaoyu Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+G">Guofeng Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.16809v1-abstract-short" style="display: inline;"> 3D Gaussian Splatting (3DGS) has recently attracted wide attention in various areas such as 3D navigation, Virtual Reality (VR) and 3D simulation, due to its photorealistic and efficient rendering performance. 
High-quality reconstruction of 3DGS relies on sufficient splats and a reasonable distribution of these splats to fit real geometric surface and texture details, which turns out to be a chall&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.16809v1-abstract-full').style.display = 'inline'; document.getElementById('2412.16809v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.16809v1-abstract-full" style="display: none;"> 3D Gaussian Splatting (3DGS) has recently attracted wide attention in various areas such as 3D navigation, Virtual Reality (VR) and 3D simulation, due to its photorealistic and efficient rendering performance. High-quality reconstruction of 3DGS relies on sufficient splats and a reasonable distribution of these splats to fit real geometric surface and texture details, which turns out to be a challenging problem. We present GeoTexDensifier, a novel geometry-texture-aware densification strategy to reconstruct high-quality Gaussian splats which better comply with the geometric structure and texture richness of the scene. Specifically, our GeoTexDensifier framework carries out an auxiliary texture-aware densification method to produce a denser distribution of splats in fully textured areas, while keeping sparsity in low-texture regions to maintain the quality of the Gaussian point cloud. Meanwhile, a geometry-aware splitting strategy takes depth and normal priors to guide the splitting sampling and filter out the noisy splats whose initial positions are far from the actual geometric surfaces they aim to fit, under a Validation of Depth Ratio Change check. With the help of the relative monocular depth prior, such geometry-aware validation can effectively reduce the influence of scattered Gaussians on the final rendering quality, especially in regions with weak textures or without sufficient training views. The texture-aware densification and geometry-aware splitting strategies are fully combined to obtain a set of high-quality Gaussian splats. We evaluate our GeoTexDensifier framework on various datasets and compare its Novel View Synthesis results with other state-of-the-art 3DGS approaches, with detailed quantitative and qualitative evaluations to demonstrate the effectiveness of our method in producing more photorealistic 3DGS models. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.16809v1-abstract-full').style.display = 'none'; document.getElementById('2412.16809v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">12 pages, 8 figures, 1 table</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.16739">arXiv:2412.16739</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.16739">pdf</a>, <a href="https://arxiv.org/format/2412.16739">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> UNEM: UNrolled Generalized EM for Transductive Few-Shot Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+L">Long Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Shakeri%2C+F">Fereshteh Shakeri</a>, <a href="/search/cs?searchtype=author&amp;query=Sadraoui%2C+A">Aymen Sadraoui</a>, <a href="/search/cs?searchtype=author&amp;query=Kaaniche%2C+M">Mounir Kaaniche</a>, <a href="/search/cs?searchtype=author&amp;query=Pesquet%2C+J">Jean-Christophe Pesquet</a>, <a href="/search/cs?searchtype=author&amp;query=Ayed%2C+I+B">Ismail Ben Ayed</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.16739v1-abstract-short" style="display: inline;"> Transductive few-shot learning has recently triggered wide attention in computer vision. Yet, current methods introduce key hyper-parameters, which control the prediction statistics of the test batches, such as the level of class balance, affecting performances significantly. Such hyper-parameters are empirically grid-searched over validation data, and their configurations may vary substantially w&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.16739v1-abstract-full').style.display = 'inline'; document.getElementById('2412.16739v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.16739v1-abstract-full" style="display: none;"> Transductive few-shot learning has recently triggered wide attention in computer vision. Yet, current methods introduce key hyper-parameters, which control the prediction statistics of the test batches, such as the level of class balance, affecting performances significantly. Such hyper-parameters are empirically grid-searched over validation data, and their configurations may vary substantially with the target dataset and pre-training model, making such empirical searches both sub-optimal and computationally intractable. In this work, we advocate and introduce the unrolling paradigm, also referred to as &#34;learning to optimize&#34;, in the context of few-shot learning, thereby learning efficiently and effectively a set of optimized hyper-parameters. Specifically, we unroll a generalization of the ubiquitous Expectation-Maximization (EM) optimizer into a neural network architecture, mapping each of its iterates to a layer and learning a set of key hyper-parameters over validation data. Our unrolling approach covers various statistical feature distributions and pre-training paradigms, including recent foundational vision-language models and standard vision-only classifiers. 
We report comprehensive experiments, which cover a breadth of fine-grained downstream image classification tasks, showing significant gains brought by the proposed unrolled EM algorithm over iterative variants. The achieved improvements reach up to 10% and 7.5% on vision-only and vision-language benchmarks, respectively. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.16739v1-abstract-full').style.display = 'none'; document.getElementById('2412.16739v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.16502">arXiv:2412.16502</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.16502">pdf</a>, <a href="https://arxiv.org/format/2412.16502">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> </div> </div> <p class="title is-5 mathjax"> Spatial-Temporal Knowledge Distillation for Takeaway Recommendation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+S">Shuyuan Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+W">Wei Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Shi%2C+B">Boyan Shi</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+L">Liyong Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Lin%2C+S">Shuohao Lin</a>, <a href="/search/cs?searchtype=author&amp;query=Wan%2C+H">Huaiyu Wan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.16502v2-abstract-short" style="display: inline;"> The takeaway recommendation system aims to recommend users&#39; future takeaway purchases based on their historical purchase behaviors, thereby improving user satisfaction and boosting merchant sales. Existing methods focus on incorporating auxiliary information or leveraging knowledge graphs to alleviate the sparsity issue of user purchase sequences. However, two main challenges limit the performance&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.16502v2-abstract-full').style.display = 'inline'; document.getElementById('2412.16502v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.16502v2-abstract-full" style="display: none;"> The takeaway recommendation system aims to recommend users&#39; future takeaway purchases based on their historical purchase behaviors, thereby improving user satisfaction and boosting merchant sales. Existing methods focus on incorporating auxiliary information or leveraging knowledge graphs to alleviate the sparsity issue of user purchase sequences. 
arXiv:2412.16502 (https://arxiv.org/abs/2412.16502) [pdf, other]
Subjects: cs.LG (Machine Learning); cs.IR (Information Retrieval)
Title: Spatial-Temporal Knowledge Distillation for Takeaway Recommendation
Authors: Shuyuan Zhao, Wei Chen, Boyan Shi, Liyong Zhou, Shuohao Lin, Huaiyu Wan
Abstract: Takeaway recommendation systems aim to recommend users' future takeaway purchases based on their historical purchase behaviors, thereby improving user satisfaction and boosting merchant sales. Existing methods focus on incorporating auxiliary information or leveraging knowledge graphs to alleviate the sparsity of user purchase sequences. However, two main challenges limit the performance of these approaches: (1) capturing dynamic user preferences over complex geospatial information and (2) efficiently integrating spatial-temporal knowledge from both graphs and sequence data at low computational cost. In this paper, we propose STKDRec, a novel spatial-temporal knowledge distillation model for takeaway recommendation based on a two-stage training process. During the first, pre-training stage, a spatial-temporal knowledge graph (STKG) encoder is trained to extract high-order spatial-temporal dependencies and collaborative associations from the STKG. During the second, spatial-temporal knowledge distillation (STKD) stage, a spatial-temporal Transformer (ST-Transformer) is employed to comprehensively model dynamic user preferences over various types of fine-grained geospatial information from a sequential perspective. The STKD strategy then transfers graph-based spatial-temporal knowledge to the ST-Transformer, facilitating the adaptive fusion of rich knowledge derived from both the STKG and sequence data while reducing computational overhead. Extensive experiments on three real-world datasets show that STKDRec significantly outperforms state-of-the-art baselines.
Submitted: 5 February, 2025; v1 submitted 21 December, 2024; originally announced December 2024.
Comments: Accepted by AAAI2025
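The STKD stage is a teacher-student transfer from the frozen graph encoder to the sequence model. As an illustration only, here is a generic Hinton-style distillation objective of the kind such a stage could build on; STKDRec's actual loss is not specified in the abstract, and `T`, `alpha`, and the logit shapes are assumptions.

```python
import torch.nn.functional as F

def stkd_style_loss(student_logits, teacher_logits, targets, T=2.0, alpha=0.5):
    # Task term: the student's own recommendation loss on ground-truth items.
    ce = F.cross_entropy(student_logits, targets)
    # Transfer term: KL to the frozen graph teacher's softened predictions.
    kd = F.kl_div(
        F.log_softmax(student_logits / T, dim=-1),
        F.softmax(teacher_logits.detach() / T, dim=-1),
        reduction="batchmean",
    ) * (T * T)  # rescale gradients for the temperature
    return alpha * ce + (1 - alpha) * kd
```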
arXiv:2412.14018 (https://arxiv.org/abs/2412.14018) [pdf, other]
Subjects: cs.CV (Computer Vision and Pattern Recognition); cs.AI (Artificial Intelligence); cs.MM (Multimedia); cs.RO (Robotics)
Title: SurgSora: Decoupled RGBD-Flow Diffusion Model for Controllable Surgical Video Generation
Authors: Tong Chen, Shuya Yang, Junyi Wang, Long Bai, Hongliang Ren, Luping Zhou
Abstract: Medical video generation has transformative potential for enhancing surgical understanding and pathology insights through precise and controllable visual representations. However, current models face limitations in controllability and authenticity. To bridge this gap, we propose SurgSora, a motion-controllable surgical video generation framework that uses a single input frame and user-controllable motion cues. SurgSora consists of three key modules: the Dual Semantic Injector (DSI), which extracts object-relevant RGB and depth features from the input frame and integrates them with segmentation cues to capture detailed spatial features of complex anatomical structures; the Decoupled Flow Mapper (DFM), which fuses optical flow with semantic-RGB-D features at multiple scales to enhance temporal understanding and object spatial dynamics; and the Trajectory Controller (TC), which allows users to specify motion directions and estimates sparse optical flow to guide the video generation process. The fused features condition a frozen Stable Diffusion model to produce realistic, temporally coherent surgical videos. Extensive evaluations demonstrate that SurgSora outperforms state-of-the-art methods in controllability and authenticity, showing its potential to advance surgical video generation for medical education, training, and research.
Submitted: 18 December, 2024; originally announced December 2024.
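A dataflow-only sketch of how such modules could feed conditioning into a frozen diffusion backbone; every submodule below is a stand-in linear map rather than the paper's architecture, and the feature width `d` is arbitrary.

```python
import torch
import torch.nn as nn

class SurgSoraDataflow(nn.Module):
    """Sketch of the module wiring described in the abstract (stand-ins only):
    DSI fuses frame-level cues, DFM fuses them with flow into a condition."""
    def __init__(self, d: int = 64):
        super().__init__()
        self.dsi = nn.Linear(3 * d, d)  # Dual Semantic Injector stand-in
        self.dfm = nn.Linear(2 * d, d)  # Decoupled Flow Mapper stand-in

    def forward(self, rgb, depth, seg, flow):
        # rgb/depth/seg/flow: (B, d) toy feature vectors from one input frame.
        sem = self.dsi(torch.cat([rgb, depth, seg], dim=-1))
        cond = self.dfm(torch.cat([sem, flow], dim=-1))
        return cond  # conditioning signal for a frozen video diffusion model
```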
arXiv:2412.11732 (https://arxiv.org/abs/2412.11732) [pdf, other]
Subjects: cs.CL (Computation and Language)
Title: Findings of the WMT 2024 Shared Task on Discourse-Level Literary Translation
Authors: Longyue Wang, Siyou Liu, Chenyang Lyu, Wenxiang Jiao, Xing Wang, Jiahao Xu, Zhaopeng Tu, Yan Gu, Weiyu Chen, Minghao Wu, Liting Zhou, Philipp Koehn, Andy Way, Yulin Yuan
Abstract: Following last year's edition, we continued to host the WMT shared task on Discourse-Level Literary Translation, now in its second edition. We focus on three language directions: Chinese-English, Chinese-German, and Chinese-Russian, with the latter two newly added. This year, we received 10 submissions in total from 5 academic and industry teams. We employ both automatic and human evaluations to measure the performance of the submitted systems, and the official ranking is based on the overall human judgments. We release data, system outputs, and the leaderboard at https://www2.statmt.org/wmt24/literary-translation-task.html.
Submitted: 16 December, 2024; originally announced December 2024.
Comments: WMT2024
arXiv:2412.10761 (https://arxiv.org/abs/2412.10761) [pdf, other]
Subjects: cs.CV (Computer Vision and Pattern Recognition); cs.AI (Artificial Intelligence)
Title: Rebalanced Vision-Language Retrieval Considering Structure-Aware Distillation
Authors: Yang Yang, Wenjuan Xi, Luping Zhou, Jinhui Tang
Abstract: Vision-language retrieval aims to search for similar instances in one modality based on queries from another modality, with the primary objective of learning cross-modal matching representations in a latent common space. The assumption underlying cross-modal matching is modal balance, where each modality contains sufficient information to represent the others. However, noise interference and modality insufficiency often lead to modal imbalance, making it a common phenomenon in practice. The impact of imbalance on retrieval performance remains an open question. In this paper, we first demonstrate that ultimate cross-modal matching is generally sub-optimal for cross-modal retrieval when imbalanced modalities exist: the structure of instances in the common space is inherently distorted by imbalanced modalities, posing a challenge to cross-modal similarity measurement. To address this issue, we emphasize the importance of meaningful structure-preserved matching and propose a simple yet effective method to rebalance cross-modal matching by learning structure-preserved matching representations. Specifically, we design a novel multi-granularity cross-modal matching that incorporates structure-aware distillation alongside the cross-modal matching loss. While the cross-modal matching loss constrains instance-level matching, the structure-aware distillation further regularizes the geometric consistency between the learned matching representations and intra-modal representations through the developed relational matching. Extensive experiments on different datasets affirm the superior cross-modal retrieval performance of our approach, which simultaneously enhances single-modal retrieval capabilities compared to baseline models.
Submitted: 14 December, 2024; originally announced December 2024.
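As a rough illustration of a structure-aware distillation term, not the paper's formulation, one can regularize the batch-level relation matrix of the matching space toward that of the (detached) intra-modal space:

```python
import torch
import torch.nn.functional as F

def relation_matrix(x):
    # Pairwise cosine similarities within a batch: the geometric "structure".
    x = F.normalize(x, dim=-1)
    return x @ x.T

def structure_distill_loss(matching_repr, intramodal_repr):
    """Generic relational-distillation recipe (illustrative only): align the
    relation matrix of the learned matching space with the intra-modal one."""
    r_match = relation_matrix(matching_repr)
    r_intra = relation_matrix(intramodal_repr.detach())
    return F.mse_loss(r_match, r_intra)
```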
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.07255v1-abstract-full').style.display = 'none'; document.getElementById('2412.07255v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.07140">arXiv:2412.07140</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.07140">pdf</a>, <a href="https://arxiv.org/format/2412.07140">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> FIRE: Robust Detection of Diffusion-Generated Images via Frequency-Guided Reconstruction Error </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Chu%2C+B">Beilin Chu</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+X">Xuan Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+X">Xin Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Y">Yufei Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=You%2C+W">Weike You</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+L">Linna Zhou</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.07140v2-abstract-short" style="display: inline;"> The rapid advancement of diffusion models has significantly improved high-quality image generation, making generated content increasingly challenging to distinguish from real images and raising concerns about potential misuse. In this paper, we observe that diffusion models struggle to accurately reconstruct mid-band frequency information in real images, suggesting the limitation could serve as a&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.07140v2-abstract-full').style.display = 'inline'; document.getElementById('2412.07140v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.07140v2-abstract-full" style="display: none;"> The rapid advancement of diffusion models has significantly improved high-quality image generation, making generated content increasingly challenging to distinguish from real images and raising concerns about potential misuse. In this paper, we observe that diffusion models struggle to accurately reconstruct mid-band frequency information in real images, suggesting the limitation could serve as a cue for detecting diffusion model generated images. Motivated by this observation, we propose a novel method called Frequency-guided Reconstruction Error (FIRE), which, to the best of our knowledge, is the first to investigate the influence of frequency decomposition on reconstruction error. FIRE assesses the variation in reconstruction error before and after the frequency decomposition, offering a robust method for identifying diffusion model generated images. 
arXiv:2412.07140 (https://arxiv.org/abs/2412.07140) [pdf, other]
Subjects: cs.CV (Computer Vision and Pattern Recognition)
Title: FIRE: Robust Detection of Diffusion-Generated Images via Frequency-Guided Reconstruction Error
Authors: Beilin Chu, Xuan Xu, Xin Wang, Yufei Zhang, Weike You, Linna Zhou
Abstract: The rapid advancement of diffusion models has significantly improved high-quality image generation, making generated content increasingly difficult to distinguish from real images and raising concerns about potential misuse. In this paper, we observe that diffusion models struggle to accurately reconstruct mid-band frequency information in real images, suggesting that this limitation can serve as a cue for detecting diffusion-model-generated images. Motivated by this observation, we propose Frequency-guided Reconstruction Error (FIRE), which, to the best of our knowledge, is the first method to investigate the influence of frequency decomposition on reconstruction error. FIRE assesses the variation in reconstruction error before and after frequency decomposition, offering a robust method for identifying diffusion-model-generated images. Extensive experiments show that FIRE generalizes effectively to unseen diffusion models and maintains robustness against diverse perturbations.
Submitted: 27 February, 2025; v1 submitted 9 December, 2024; originally announced December 2024.
Comments: 14 pages, 14 figures
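A sketch of the frequency-guided recipe under stated assumptions: the band edges are illustrative rather than the paper's values, and `reconstruct` stands in for the diffusion-based inversion-and-reconstruction step the method relies on.

```python
import numpy as np

def midband_filter(img, lo=0.1, hi=0.5):
    """Keep only mid-band frequencies (radius in [lo, hi] of Nyquist) via a
    2-D FFT mask; band edges here are illustrative assumptions."""
    h, w = img.shape
    f = np.fft.fftshift(np.fft.fft2(img))
    yy, xx = np.mgrid[-h // 2:h - h // 2, -w // 2:w - w // 2]
    r = np.sqrt((yy / (h / 2)) ** 2 + (xx / (w / 2)) ** 2)
    return np.real(np.fft.ifft2(np.fft.ifftshift(f * ((r >= lo) & (r <= hi)))))

def fire_score(img, reconstruct):
    # Compare reconstruction error before vs. after frequency decomposition;
    # `reconstruct` is any callable mapping an image to its reconstruction.
    e_full = np.mean((img - reconstruct(img)) ** 2)
    band = midband_filter(img)
    e_band = np.mean((band - reconstruct(band)) ** 2)
    return e_band - e_full  # real images tend to shift differently here
```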
arXiv:2412.05596 (https://arxiv.org/abs/2412.05596) [pdf, other]
Subjects: cs.CV (Computer Vision and Pattern Recognition)
Title: TB-HSU: Hierarchical 3D Scene Understanding with Contextual Affordances
Authors: Wenting Xu, Viorela Ila, Luping Zhou, Craig T. Jin
Abstract: The concept of function and affordance is a critical aspect of 3D scene understanding and supports task-oriented objectives. In this work, we develop a model that learns to structure and vary functional affordance across a 3D hierarchical scene graph (3DHSG) representing the spatial organization of a scene, where the varying functional affordance is designed to integrate with the varying spatial context of the graph. More specifically, we develop an algorithm that learns to construct a 3DHSG capturing the spatial organization of the scene. Starting from segmented object point clouds and object semantic labels, the 3DHSG has a top node that identifies the room label, child nodes that define local spatial regions inside the room with region-specific affordances, and grandchild nodes indicating object locations and object-specific affordances. To support this work, we create a custom 3DHSG dataset that provides ground-truth data for local spatial regions with region-specific affordances as well as object-specific affordances for each object. We employ a transformer-based model to learn the 3DHSG within a multi-task learning framework that jointly learns room classification and the definition of spatial regions within the room with region-specific affordances. Our work improves on the performance of state-of-the-art baseline models and demonstrates one approach for applying transformer models to 3D scene understanding and to generating 3DHSGs that capture the spatial organization of a room. The code and dataset are publicly available.
Submitted: 24 February, 2025; v1 submitted 7 December, 2024; originally announced December 2024.
Comments: Accepted by AAAI2025
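The three-level 3DHSG can be pictured as a simple tree. The sketch below is a hand-built toy instance with invented labels and affordances, not the learned graph, just to make the room-region-object hierarchy concrete.

```python
from dataclasses import dataclass, field

@dataclass
class HSGNode:
    label: str                                   # room, region, or object label
    affordances: list = field(default_factory=list)
    children: list = field(default_factory=list)

# Toy 3DHSG: room node -> local spatial regions -> objects, with affordances
# attached at every level (all names invented for illustration).
room = HSGNode("kitchen", ["prepare food"], [
    HSGNode("sink area", ["wash"], [
        HSGNode("faucet", ["turn on", "turn off"]),
        HSGNode("sponge", ["scrub"]),
    ]),
    HSGNode("dining corner", ["eat"], [HSGNode("table", ["place items"])]),
])
```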
arXiv:2412.05322 (https://arxiv.org/abs/2412.05322) [pdf, other]
Subjects: eess.IV (Image and Video Processing); cs.AI (Artificial Intelligence); cs.CV (Computer Vision and Pattern Recognition)
Title: $\rho$-NeRF: Leveraging Attenuation Priors in Neural Radiance Field for 3D Computed Tomography Reconstruction
Authors: Li Zhou, Changsheng Fang, Bahareh Morovati, Yongtong Liu, Shuo Han, Yongshun Xu, Hengyong Yu
Abstract: This paper introduces $\rho$-NeRF, a self-supervised approach that sets a new standard in novel view synthesis (NVS) and computed tomography (CT) reconstruction by modeling a continuous volumetric radiance field enriched with physics-based attenuation priors. $\rho$-NeRF represents a three-dimensional (3D) volume through a fully connected neural network that takes a single continuous four-dimensional (4D) coordinate, the spatial location $(x, y, z)$ together with an initialized attenuation value ($\rho$), and outputs the attenuation coefficient at that position. By querying these 4D coordinates along X-ray paths, the classic forward-projection technique is applied to integrate attenuation data across the 3D space. By matching and refining pre-initialized attenuation values derived from traditional reconstruction algorithms such as the Feldkamp-Davis-Kress (FDK) algorithm or conjugate gradient least squares (CGLS), the enriched schema delivers superior fidelity in both projection synthesis and image reconstruction.
Submitted: 3 December, 2024; originally announced December 2024.
Comments: The paper was submitted to CVPR 2025
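A minimal sketch of the described field and the classic forward projection, under stated assumptions: the MLP width and depth are arbitrary, and the positional encoding that NeRF-style models typically use is omitted.

```python
import torch
import torch.nn as nn

class RhoField(nn.Module):
    """Stand-in for the described field: a 4D coordinate (x, y, z, initial
    rho) is mapped to a refined attenuation coefficient (sketch only)."""
    def __init__(self, hidden: int = 64):
        super().__init__()
        self.mlp = nn.Sequential(
            nn.Linear(4, hidden), nn.ReLU(),
            nn.Linear(hidden, hidden), nn.ReLU(),
            nn.Linear(hidden, 1), nn.Softplus(),  # attenuation is nonnegative
        )

    def forward(self, coords):                    # coords: (..., 4)
        return self.mlp(coords).squeeze(-1)

def forward_project(field, ray_samples, step_size):
    # Classic forward projection: integrate attenuation along each X-ray.
    # ray_samples: (rays, samples_per_ray, 4) -> line integrals, shape (rays,)
    return field(ray_samples).sum(dim=-1) * step_size
```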
arXiv:2412.03961 (https://arxiv.org/abs/2412.03961) [pdf]
Subjects: cs.LG (Machine Learning)
Title: Electronic Health Records-Based Data-Driven Diabetes Knowledge Unveiling and Risk Prognosis
Authors: Huadong Pang, Li Zhou, Yiping Dong, Peiyuan Chen, Dian Gu, Tianyi Lyu, Hansong Zhang
Abstract: In the healthcare sector, deep learning technologies have revolutionized data analysis and disease forecasting. This is particularly evident in diabetes care, where deep analysis of Electronic Health Records (EHR) has unlocked new opportunities for early detection and effective intervention. Our research presents a model that combines Bidirectional Long Short-Term Memory Networks-Conditional Random Field (BiLSTM-CRF) with a fusion of XGBoost and Logistic Regression, designed to enhance the accuracy of diabetes risk prediction through in-depth analysis of electronic medical records. In the first phase, BiLSTM-CRF is employed to capture the temporal characteristics and latent patterns in EHR data, effectively uncovering progression trends of diabetes that are often hidden in the complex structure of medical records. In the second phase, the combined strength of XGBoost and Logistic Regression is leveraged to classify the extracted features and evaluate associated risks. This dual approach facilitates a more nuanced and precise prediction of diabetes, outperforming traditional models, particularly on multifaceted and nonlinear medical datasets. Our study highlights the value of data-driven strategies in clinical decision-making, equipping healthcare professionals with precise tools for early detection and intervention. By enabling personalized treatment and timely care, our approach marks progress in incorporating advanced analytics into healthcare, potentially improving outcomes for diabetes and other chronic conditions.
Submitted: 5 December, 2024; originally announced December 2024.
Comments: 16 pages
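The second phase can be pictured as probability-level fusion of a boosted-tree model with logistic regression. The sketch below substitutes scikit-learn's GradientBoostingClassifier for XGBoost to stay dependency-light, and random features stand in for the BiLSTM-CRF outputs, so it illustrates the fusion pattern only.

```python
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression

rng = np.random.default_rng(0)
X = rng.random((200, 16))                 # stand-in for BiLSTM-CRF features
y = (X[:, 0] + X[:, 1] > 1).astype(int)   # toy diabetes-risk labels

# Soft voting averages the two models' predicted probabilities.
fusion = VotingClassifier(
    [("gb", GradientBoostingClassifier()), ("lr", LogisticRegression())],
    voting="soft",
)
print(fusion.fit(X, y).predict_proba(X[:3]))
```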
arXiv:2412.02855 (https://arxiv.org/abs/2412.02855) [pdf, other]
Subjects: cs.CV (Computer Vision and Pattern Recognition); cs.LG (Machine Learning)
Title: Optimized CNNs for Rapid 3D Point Cloud Object Recognition
Authors: Tianyi Lyu, Dian Gu, Peiyuan Chen, Yaoting Jiang, Zhenhong Zhang, Huadong Pang, Li Zhou, Yiping Dong
Abstract: This study introduces a method for efficiently detecting objects within 3D point clouds using convolutional neural networks (CNNs). Our approach adopts a feature-centric voting mechanism to construct convolutional layers that capitalize on the sparsity typical of the input data. We explore the trade-off between accuracy and speed across diverse network architectures and advocate integrating an $\mathcal{L}_1$ penalty on filter activations to augment sparsity within intermediate layers. This research pioneers sparse convolutional layers combined with $\mathcal{L}_1$ regularization for the effective processing of large-scale 3D data. Our method's efficacy is demonstrated on the MVTec 3D-AD object detection benchmark, where Vote3Deep models with just three layers outperform the previous state of the art in both laser-only and combined laser-vision approaches while maintaining competitive processing speeds. This underscores our approach's ability to substantially enhance detection performance while ensuring the computational efficiency required for real-time applications.
Submitted: 3 December, 2024; originally announced December 2024.
Comments: 15 pages
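The $\mathcal{L}_1$ activation penalty is simple to state in code. A minimal sketch, with an arbitrary penalty weight and a placeholder task loss standing in for the detection objective:

```python
import torch
import torch.nn as nn

conv = nn.Sequential(nn.Conv3d(1, 8, 3, padding=1), nn.ReLU())
x = torch.zeros(1, 1, 16, 16, 16)   # mostly-empty voxel grid (sparse input)
x[0, 0, 8, 8, 8] = 1.0

feats = conv(x)
task_loss = feats.pow(2).mean()          # placeholder for the detection loss
l1_penalty = 1e-3 * feats.abs().mean()   # pushes activations toward zero,
                                         # keeping intermediate layers sparse
loss = task_loss + l1_penalty
loss.backward()
```

In training, the penalty weight trades detection accuracy against the activation sparsity that keeps the voting-based convolutions fast.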
sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div> <!-- end MetaColumn 2 --> </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>

Pages: 1 2 3 4 5 6 7 8 9 10