
Search | arXiv e-print repository

<a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1&ndash;50 of 542 results for author: <span class="mathjax">Xia, Z</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>&nbsp;&nbsp;</span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&amp;query=Xia%2C+Z">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." 
type="text" value="Xia, Z"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Xia%2C+Z&amp;terms-0-field=author&amp;size=50&amp;order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Xia, Z"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. 
</div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&amp;query=Xia%2C+Z&amp;start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&amp;query=Xia%2C+Z&amp;start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Xia%2C+Z&amp;start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Xia%2C+Z&amp;start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Xia%2C+Z&amp;start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Xia%2C+Z&amp;start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">&hellip;</span></li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.17028">arXiv:2502.17028</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.17028">pdf</a>, <a href="https://arxiv.org/format/2502.17028">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Distributional Vision-Language Alignment by Cauchy-Schwarz Divergence </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Yin%2C+W">Wenzhe Yin</a>, <a href="/search/cs?searchtype=author&amp;query=Xiao%2C+Z">Zehao Xiao</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+P">Pan Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Yu%2C+S">Shujian Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Shen%2C+J">Jiayi Shen</a>, <a href="/search/cs?searchtype=author&amp;query=Sonke%2C+J">Jan-Jakob Sonke</a>, <a href="/search/cs?searchtype=author&amp;query=Gavves%2C+E">Efstratios Gavves</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.17028v1-abstract-short" style="display: inline;"> Multimodal alignment is crucial for various downstream tasks such as cross-modal generation and retrieval. 
Previous multimodal approaches like CLIP maximize the mutual information mainly by aligning pairwise samples across modalities while overlooking the distributional differences, leading to suboptimal alignment with modality gaps. In this paper, to overcome the limitation, we propose CS-Aligner&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.17028v1-abstract-full').style.display = 'inline'; document.getElementById('2502.17028v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.17028v1-abstract-full" style="display: none;"> Multimodal alignment is crucial for various downstream tasks such as cross-modal generation and retrieval. Previous multimodal approaches like CLIP maximize the mutual information mainly by aligning pairwise samples across modalities while overlooking the distributional differences, leading to suboptimal alignment with modality gaps. In this paper, to overcome the limitation, we propose CS-Aligner, a novel and straightforward framework that performs distributional vision-language alignment by integrating Cauchy-Schwarz (CS) divergence with mutual information. In the proposed framework, we find that the CS divergence and mutual information serve complementary roles in multimodal alignment, capturing both the global distribution information of each modality and the pairwise semantic relationships, yielding tighter and more precise alignment. Moreover, CS-Aligher enables incorporating additional information from unpaired data and token-level representations, enhancing flexible and fine-grained alignment in practice. Experiments on text-to-image generation and cross-modality retrieval tasks demonstrate the effectiveness of our method on vision-language alignment. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.17028v1-abstract-full').style.display = 'none'; document.getElementById('2502.17028v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. 
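   For background only (the abstract does not spell out the definition it builds on): for two densities p and q, the Cauchy-Schwarz divergence is commonly defined as
   $$ D_{\mathrm{CS}}(p,q) \;=\; -\log \frac{\left(\int p(x)\,q(x)\,dx\right)^{2}}{\int p(x)^{2}\,dx \,\int q(x)^{2}\,dx}, $$
   which is non-negative and vanishes exactly when p = q (by the Cauchy-Schwarz inequality), so minimizing it aligns the two modality distributions as wholes rather than only paired samples. How CS-Aligner estimates this quantity and combines it with mutual information is detailed in the paper, not here.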
2. arXiv:2502.13646 (https://arxiv.org/abs/2502.13646) [pdf, other] - cs.CL
   D.Va: Validate Your Demonstration First Before You Use It
   Authors: Qi Zhang, Zhiqing Xiao, Ruixuan Xiao, Lirong Gao, Junbo Zhao
   Abstract: In-context learning (ICL) has demonstrated significant potential in enhancing the capabilities of large language models (LLMs) during inference. It's well-established that ICL heavily relies on selecting effective demonstrations to generate outputs that better align with the expected results. As for demonstration selection, previous approaches have typically relied on intuitive metrics to evaluate the effectiveness of demonstrations, which often results in limited robustness and poor cross-model generalization capabilities. To tackle these challenges, we propose a novel method, Demonstration VAlidation (D.Va), which integrates a demonstration validation perspective into this field. By introducing the demonstration validation mechanism, our method effectively identifies demonstrations that are both effective and highly generalizable. D.Va surpasses all existing demonstration selection techniques across both natural language understanding (NLU) and natural language generation (NLG) tasks. Additionally, we demonstrate the robustness and generalizability of our approach across various language models with different retrieval models.
   Submitted 19 February, 2025; originally announced February 2025.
   Comments: 14 pages, 6 figures

3. arXiv:2502.12195 (https://arxiv.org/abs/2502.12195) [pdf, other] - cs.LG
   GeneralizeFormer: Layer-Adaptive Model Generation across Test-Time Distribution Shifts
   Authors: Sameer Ambekar, Zehao Xiao, Xiantong Zhen, Cees G. M. Snoek
   Abstract: We consider the problem of test-time domain generalization, where a model is trained on several source domains and adjusted on target domains never seen during training. Different from the common methods that fine-tune the model or adjust the classifier parameters online, we propose to generate multiple layer parameters on the fly during inference by a lightweight meta-learned transformer, which we call GeneralizeFormer. The layer-wise parameters are generated per target batch without fine-tuning or online adjustment. By doing so, our method is more effective in dynamic scenarios with multiple target distributions and also avoids forgetting valuable source distribution characteristics. Moreover, by considering layer-wise gradients, the proposed method adapts itself to various distribution shifts. To reduce the computational and time cost, we fix the convolutional parameters while only generating parameters of the Batch Normalization layers and the linear classifier. Experiments on six widely used domain generalization datasets demonstrate the benefits and abilities of the proposed method to efficiently handle various distribution shifts, generalize in dynamic scenarios, and avoid forgetting.
   Submitted 15 February, 2025; originally announced February 2025.
   Comments: WACV 2025
4. arXiv:2502.12109 (https://arxiv.org/abs/2502.12109) [pdf, other] - cs.CL, cs.AI
   Personality Structured Interview for Large Language Model Simulation in Personality Research
   Authors: Pengda Wang, Huiqi Zou, Hanjie Chen, Tianjun Sun, Ziang Xiao, Frederick L. Oswald
   Abstract: Although psychometrics researchers have recently explored the use of large language models (LLMs) as proxies for human participants, LLMs often fail to generate heterogeneous data with human-like diversity, which diminishes their value in advancing social science research. To address these challenges, we explored the potential of the theory-informed Personality Structured Interview (PSI) as a tool for simulating human responses in personality research. In this approach, the simulation is grounded in nuanced real-human interview transcripts that target the personality construct of interest. We have provided a growing set of 357 structured interview transcripts from a representative sample, each containing an individual's response to 32 open-ended questions carefully designed to gather theory-based personality evidence. Additionally, grounded in psychometric research, we have summarized an evaluation framework to systematically validate LLM-generated psychometric data. Results from three experiments demonstrate that well-designed structured interviews could improve human-like heterogeneity in LLM-simulated personality data and predict personality-related behavioral outcomes (i.e., organizational citizenship behaviors and counterproductive work behavior). We further discuss the role of theory-informed structured interviews in LLM-based simulation and outline a general framework for designing structured interviews to simulate human-like data for psychometric research.
   Submitted 17 February, 2025; originally announced February 2025.
   Comments: 41 Pages, 30 Tables, 5 Figures

5. arXiv:2502.11919 (https://arxiv.org/abs/2502.11919) [pdf, other] - cs.HC, cs.CL
   From Text to Trust: Empowering AI-assisted Decision Making with Adaptive LLM-powered Analysis
   Authors: Zhuoyan Li, Hangxiao Zhu, Zhuoran Lu, Ziang Xiao, Ming Yin
   Abstract: AI-assisted decision making becomes increasingly prevalent, yet individuals often fail to utilize AI-based decision aids appropriately, especially when the AI explanations are absent, potentially as they do not reflect on AI's decision recommendations critically. Large language models (LLMs), with their exceptional conversational and analytical capabilities, present great opportunities to enhance AI-assisted decision making in the absence of AI explanations by providing natural-language-based analysis of AI's decision recommendation, e.g., how each feature of a decision making task might contribute to the AI recommendation. In this paper, via a randomized experiment, we first show that presenting LLM-powered analysis of each task feature, either sequentially or concurrently, does not significantly improve people's AI-assisted decision performance. To enable decision makers to better leverage LLM-powered analysis, we then propose an algorithmic framework to characterize the effects of LLM-powered analysis on human decisions and dynamically decide which analysis to present. Our evaluation with human subjects shows that this approach effectively improves decision makers' appropriate reliance on AI in AI-assisted decision making.
   Submitted 17 February, 2025; originally announced February 2025.
   Comments: CHI 2025
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">CHI 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.11089">arXiv:2502.11089</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.11089">pdf</a>, <a href="https://arxiv.org/format/2502.11089">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Native Sparse Attention: Hardware-Aligned and Natively Trainable Sparse Attention </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Yuan%2C+J">Jingyang Yuan</a>, <a href="/search/cs?searchtype=author&amp;query=Gao%2C+H">Huazuo Gao</a>, <a href="/search/cs?searchtype=author&amp;query=Dai%2C+D">Damai Dai</a>, <a href="/search/cs?searchtype=author&amp;query=Luo%2C+J">Junyu Luo</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+L">Liang Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Z">Zhengyan Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Xie%2C+Z">Zhenda Xie</a>, <a href="/search/cs?searchtype=author&amp;query=Wei%2C+Y+X">Y. X. Wei</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+L">Lean Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Xiao%2C+Z">Zhiping Xiao</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Y">Yuqing Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Ruan%2C+C">Chong Ruan</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+M">Ming Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Liang%2C+W">Wenfeng Liang</a>, <a href="/search/cs?searchtype=author&amp;query=Zeng%2C+W">Wangding Zeng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.11089v1-abstract-short" style="display: inline;"> Long-context modeling is crucial for next-generation language models, yet the high computational cost of standard attention mechanisms poses significant computational challenges. Sparse attention offers a promising direction for improving efficiency while maintaining model capabilities. We present NSA, a Natively trainable Sparse Attention mechanism that integrates algorithmic innovations with har&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.11089v1-abstract-full').style.display = 'inline'; document.getElementById('2502.11089v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.11089v1-abstract-full" style="display: none;"> Long-context modeling is crucial for next-generation language models, yet the high computational cost of standard attention mechanisms poses significant computational challenges. Sparse attention offers a promising direction for improving efficiency while maintaining model capabilities. 
We present NSA, a Natively trainable Sparse Attention mechanism that integrates algorithmic innovations with hardware-aligned optimizations to achieve efficient long-context modeling. NSA employs a dynamic hierarchical sparse strategy, combining coarse-grained token compression with fine-grained token selection to preserve both global context awareness and local precision. Our approach advances sparse attention design with two key innovations: (1) We achieve substantial speedups through arithmetic intensity-balanced algorithm design, with implementation optimizations for modern hardware. (2) We enable end-to-end training, reducing pretraining computation without sacrificing model performance. As shown in Figure 1, experiments show the model pretrained with NSA maintains or exceeds Full Attention models across general benchmarks, long-context tasks, and instruction-based reasoning. Meanwhile, NSA achieves substantial speedups over Full Attention on 64k-length sequences across decoding, forward propagation, and backward propagation, validating its efficiency throughout the model lifecycle. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.11089v1-abstract-full').style.display = 'none'; document.getElementById('2502.11089v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.08547">arXiv:2502.08547</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.08547">pdf</a>, <a href="https://arxiv.org/format/2502.08547">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Representation Learning to Advance Multi-institutional Studies with Electronic Health Record Data </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+D">Doudou Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Tong%2C+H">Han Tong</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+L">Linshanshan Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+S">Suqi Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Xiong%2C+X">Xin Xiong</a>, <a href="/search/cs?searchtype=author&amp;query=Gan%2C+Z">Ziming Gan</a>, <a href="/search/cs?searchtype=author&amp;query=Griffier%2C+R">Romain Griffier</a>, <a href="/search/cs?searchtype=author&amp;query=Hejblum%2C+B">Boris Hejblum</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Y">Yun-Chung Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Hong%2C+C">Chuan Hong</a>, <a href="/search/cs?searchtype=author&amp;query=Bonzel%2C+C">Clara-Lea Bonzel</a>, <a href="/search/cs?searchtype=author&amp;query=Cai%2C+T">Tianrun Cai</a>, <a href="/search/cs?searchtype=author&amp;query=Pan%2C+K">Kevin Pan</a>, <a href="/search/cs?searchtype=author&amp;query=Ho%2C+Y">Yuk-Lam Ho</a>, <a href="/search/cs?searchtype=author&amp;query=Costa%2C+L">Lauren Costa</a>, <a href="/search/cs?searchtype=author&amp;query=Panickan%2C+V+A">Vidul A. Panickan</a>, <a href="/search/cs?searchtype=author&amp;query=Gaziano%2C+J+M">J. 
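   The abstract above describes NSA only at the level of its two stages: coarse-grained token compression and fine-grained token selection. The toy sketch below is a reader's illustration of that general pattern for a single query, using mean-pooled key blocks as the coarse summaries and exact attention over the top-scoring blocks as the fine stage; it is not DeepSeek's NSA implementation, and the block size, top-k, and single-head layout are arbitrary assumptions.

import numpy as np

def softmax(x):
    x = x - x.max()
    e = np.exp(x)
    return e / e.sum()

def sparse_attention(q, K, V, block_size=16, top_k_blocks=4):
    """q: (d,), K/V: (n, d). Approximate attention output of shape (d,)."""
    n, d = K.shape
    n_blocks = int(np.ceil(n / block_size))

    # Coarse stage: compress each block of keys into one summary vector (mean pooling).
    block_keys = np.stack([K[i * block_size:(i + 1) * block_size].mean(axis=0)
                           for i in range(n_blocks)])

    # Score the block summaries with the query and keep only the highest-scoring blocks.
    block_scores = block_keys @ q / np.sqrt(d)
    keep = np.argsort(block_scores)[-min(top_k_blocks, n_blocks):]

    # Fine stage: exact attention restricted to tokens inside the selected blocks.
    idx = np.concatenate([np.arange(i * block_size, min((i + 1) * block_size, n)) for i in keep])
    scores = K[idx] @ q / np.sqrt(d)
    return softmax(scores) @ V[idx]

# Tiny usage example on random data.
rng = np.random.default_rng(0)
q, K, V = rng.normal(size=64), rng.normal(size=(256, 64)), rng.normal(size=(256, 64))
print(sparse_attention(q, K, V).shape)  # (64,)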
7. arXiv:2502.08547 (https://arxiv.org/abs/2502.08547) [pdf, other] - cs.AI
   Representation Learning to Advance Multi-institutional Studies with Electronic Health Record Data
   Authors: Doudou Zhou, Han Tong, Linshanshan Wang, Suqi Liu, Xin Xiong, Ziming Gan, Romain Griffier, Boris Hejblum, Yun-Chung Liu, Chuan Hong, Clara-Lea Bonzel, Tianrun Cai, Kevin Pan, Yuk-Lam Ho, Lauren Costa, Vidul A. Panickan, J. Michael Gaziano, Kenneth Mandl, Vianney Jouhet, Rodolphe Thiebaut, Zongqi Xia, Kelly Cho, Katherine Liao, Tianxi Cai
   Abstract: The adoption of EHRs has expanded opportunities to leverage data-driven algorithms in clinical care and research. A major bottleneck in effectively conducting multi-institutional EHR studies is the data heterogeneity across systems with numerous codes that either do not exist or represent different clinical concepts across institutions. The need for data privacy further limits the feasibility of including multi-institutional patient-level data required to study similarities and differences across patient subgroups. To address these challenges, we developed the GAME algorithm. Tested and validated across 7 institutions and 2 languages, GAME integrates data in several levels: (1) at the institutional level with knowledge graphs to establish relationships between codes and existing knowledge sources, providing the medical context for standard codes and their relationship to each other; (2) between institutions, leveraging language models to determine the relationships between institution-specific codes with established standard codes; and (3) quantifying the strength of the relationships between codes using a graph attention network. Jointly trained embeddings are created using transfer and federated learning to preserve data privacy. In this study, we demonstrate the applicability of GAME in selecting relevant features as inputs for AI-driven algorithms in a range of conditions, e.g., heart failure, rheumatoid arthritis. We then highlight the application of GAME harmonized multi-institutional EHR data in a study of Alzheimer's disease outcomes and suicide risk among patients with mental health disorders, without sharing patient-level data outside individual institutions.
   Submitted 12 February, 2025; originally announced February 2025.

8. arXiv:2502.06238 (https://arxiv.org/abs/2502.06238) [pdf, other] - cs.CE
   XNet-Enhanced Deep BSDE Method and Numerical Analysis
   Authors: Xiaotao Zheng, Zhihong Xia, Xin Li, Xingye Yue
   Abstract: Solving high-dimensional semilinear parabolic partial differential equations (PDEs) challenges traditional numerical methods due to the "curse of dimensionality." Deep learning, particularly through the Deep BSDE method, offers a promising alternative by leveraging neural networks' capability to approximate high-dimensional functions. This paper introduces a novel network architecture, XNet, which significantly enhances the computational efficiency and accuracy of the Deep BSDE method. XNet demonstrates superior approximation capabilities with fewer parameters, addressing the trade-off between approximation and optimization errors found in existing methods. We detail the implementation of XNet within the Deep BSDE framework and present results that show marked improvements in solving high-dimensional PDEs, potentially setting a new standard for such computations.
   Submitted 10 February, 2025; originally announced February 2025.
9. arXiv:2502.05874 (https://arxiv.org/abs/2502.05874) [pdf, other] - cs.CV, cs.AI, cs.LG
   MMGDreamer: Mixed-Modality Graph for Geometry-Controllable 3D Indoor Scene Generation
   Authors: Zhifei Yang, Keyang Lu, Chao Zhang, Jiaxing Qi, Hanqi Jiang, Ruifei Ma, Shenglin Yin, Yifan Xu, Mingzhe Xing, Zhen Xiao, Jieyi Long, Xiangde Liu, Guangyao Zhai
   Abstract: Controllable 3D scene generation has extensive applications in virtual reality and interior design, where the generated scenes should exhibit high levels of realism and controllability in terms of geometry. Scene graphs provide a suitable data representation that facilitates these applications. However, current graph-based methods for scene generation are constrained to text-based inputs and exhibit insufficient adaptability to flexible user inputs, hindering the ability to precisely control object geometry. To address this issue, we propose MMGDreamer, a dual-branch diffusion model for scene generation that incorporates a novel Mixed-Modality Graph, visual enhancement module, and relation predictor. The mixed-modality graph allows object nodes to integrate textual and visual modalities, with optional relationships between nodes. It enhances adaptability to flexible user inputs and enables meticulous control over the geometry of objects in the generated scenes. The visual enhancement module enriches the visual fidelity of text-only nodes by constructing visual representations using text embeddings. Furthermore, our relation predictor leverages node representations to infer absent relationships between nodes, resulting in more coherent scene layouts. Extensive experimental results demonstrate that MMGDreamer exhibits superior control of object geometry, achieving state-of-the-art scene generation performance. Project page: https://yangzhifeio.github.io/project/MMGDreamer.
   Submitted 9 February, 2025; originally announced February 2025.
   Comments: Accepted by AAAI 2025 Main Track

10. arXiv:2502.04917 (https://arxiv.org/abs/2502.04917) [pdf, other] - cs.LG, cs.AI
   Complex Physics-Informed Neural Network
   Authors: Chenhao Si, Ming Yan, Xin Li, Zhihong Xia
   Abstract: We propose compleX-PINN, a novel physics-informed neural network (PINN) architecture that incorporates a learnable activation function inspired by the Cauchy integral theorem. By learning the parameters of the activation function, compleX-PINN achieves high accuracy with just a single hidden layer. Empirical results show that compleX-PINN effectively solves problems where traditional PINNs struggle and consistently delivers significantly higher precision, often by an order of magnitude.
   Submitted 7 February, 2025; originally announced February 2025.
   Comments: 16 pages, 9 figures
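   For reference only: the abstract credits the activation design to the Cauchy integral theorem but does not give its parameterization. The closely related Cauchy integral formula, for f holomorphic on and inside a simple closed contour $\gamma$ and $z$ in its interior, reads
   $$ f(z) \;=\; \frac{1}{2\pi i} \oint_{\gamma} \frac{f(\zeta)}{\zeta - z}\, d\zeta, $$
   which is the standard statement such a construction would start from; the specific learnable form used by compleX-PINN is left to the paper.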
11. arXiv:2502.04728 (https://arxiv.org/abs/2502.04728) [pdf, other] - cs.AI
   Generating Symbolic World Models via Test-time Scaling of Large Language Models
   Authors: Zhouliang Yu, Yuhuan Yuan, Tim Z. Xiao, Fuxiang Frank Xia, Jie Fu, Ge Zhang, Ge Lin, Weiyang Liu
   Abstract: Solving complex planning problems requires Large Language Models (LLMs) to explicitly model the state transition to avoid rule violations, comply with constraints, and ensure optimality, a task hindered by the inherent ambiguity of natural language. To overcome such ambiguity, Planning Domain Definition Language (PDDL) is leveraged as a planning abstraction that enables precise and formal state descriptions. With PDDL, we can generate a symbolic world model where classic searching algorithms, such as A*, can be seamlessly applied to find optimal plans. However, directly generating PDDL domains with current LLMs remains an open challenge due to the lack of PDDL training data. To address this challenge, we propose to scale up the test-time computation of LLMs to enhance their PDDL reasoning capabilities, thereby enabling the generation of high-quality PDDL domains. Specifically, we introduce a simple yet effective algorithm, which first employs a Best-of-N sampling approach to improve the quality of the initial solution and then refines the solution in a fine-grained manner with verbalized machine learning. Our method outperforms o1-mini by a considerable margin in the generation of PDDL domains, achieving over 50% success rate on two tasks (i.e., generating PDDL domains from natural language description or PDDL problems). This is done without requiring additional training. By taking advantage of PDDL as state abstraction, our method is able to outperform current state-of-the-art methods on almost all competition-level planning tasks.
   Submitted 7 February, 2025; originally announced February 2025.
   Comments: Technical Report v1 (32 pages, 6 figures)
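   The test-time scaling recipe described above (Best-of-N sampling for an initial PDDL domain, then fine-grained refinement) can be sketched generically as below. The generate/score/refine callables are hypothetical stand-ins for an LLM call, a PDDL validator, and a feedback-driven revision step; nothing here reproduces the paper's actual prompts or verifier.

import random

def best_of_n_then_refine(task, generate, score, refine, n_samples=8, refine_rounds=3):
    """Best-of-N sampling followed by iterative refinement, under the assumptions stated above."""
    candidates = [generate(task) for _ in range(n_samples)]
    best = max(candidates, key=score)              # Best-of-N: keep the highest-scoring draft
    for _ in range(refine_rounds):                 # then refine the winner step by step
        revised = refine(task, best)
        if score(revised) >= score(best):
            best = revised
    return best

# Toy usage with stand-in callables (random "quality" scores instead of a real PDDL validator).
random.seed(0)
result = best_of_n_then_refine(
    task="move blocks from A to B",
    generate=lambda t: f"(define (domain d{random.randint(0, 999)}) ...)",
    score=lambda d: random.random(),
    refine=lambda t, d: d + " ; refined",
)
print(result)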
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Technical Report v1 (32 pages, 6 figures)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.04176">arXiv:2502.04176</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.04176">pdf</a>, <a href="https://arxiv.org/format/2502.04176">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> </div> </div> <p class="title is-5 mathjax"> MRAMG-Bench: A BeyondText Benchmark for Multimodal Retrieval-Augmented Multimodal Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Yu%2C+Q">Qinhan Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Xiao%2C+Z">Zhiyou Xiao</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+B">Binghui Li</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Z">Zhengren Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+C">Chong Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+W">Wentao Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.04176v1-abstract-short" style="display: inline;"> Recent advancements in Retrieval-Augmented Generation (RAG) have shown remarkable performance in enhancing response accuracy and relevance by integrating external knowledge into generative models. However, existing RAG methods primarily focus on providing text-only answers, even in multimodal retrieval-augmented generation scenarios. In this work, we introduce the Multimodal Retrieval-Augmented Mu&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.04176v1-abstract-full').style.display = 'inline'; document.getElementById('2502.04176v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.04176v1-abstract-full" style="display: none;"> Recent advancements in Retrieval-Augmented Generation (RAG) have shown remarkable performance in enhancing response accuracy and relevance by integrating external knowledge into generative models. However, existing RAG methods primarily focus on providing text-only answers, even in multimodal retrieval-augmented generation scenarios. In this work, we introduce the Multimodal Retrieval-Augmented Multimodal Generation (MRAMG) task, which aims to generate answers that combine both text and images, fully leveraging the multimodal data within a corpus. Despite the importance of this task, there is a notable absence of a comprehensive benchmark to effectively evaluate MRAMG performance. To bridge this gap, we introduce the MRAMG-Bench, a carefully curated, human-annotated dataset comprising 4,346 documents, 14,190 images, and 4,800 QA pairs, sourced from three categories: Web Data, Academic Papers, and Lifestyle. The dataset incorporates diverse difficulty levels and complex multi-image scenarios, providing a robust foundation for evaluating multimodal generation tasks. 
To facilitate rigorous evaluation, our MRAMG-Bench incorporates a comprehensive suite of both statistical and LLM-based metrics, enabling a thorough analysis of the performance of popular generative models in the MRAMG task. Besides, we propose an efficient multimodal answer generation framework that leverages both LLMs and MLLMs to generate multimodal responses. Our datasets are available at: https://huggingface.co/MRAMG. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.04176v1-abstract-full').style.display = 'none'; document.getElementById('2502.04176v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">11 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.02384">arXiv:2502.02384</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.02384">pdf</a>, <a href="https://arxiv.org/format/2502.02384">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> STAIR: Improving Safety Alignment with Introspective Reasoning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Y">Yichi Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+S">Siyuan Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+Y">Yao Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Xia%2C+Z">Zeyu Xia</a>, <a href="/search/cs?searchtype=author&amp;query=Fang%2C+Z">Zhengwei Fang</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+X">Xiao Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Duan%2C+R">Ranjie Duan</a>, <a href="/search/cs?searchtype=author&amp;query=Yan%2C+D">Dong Yan</a>, <a href="/search/cs?searchtype=author&amp;query=Dong%2C+Y">Yinpeng Dong</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+J">Jun Zhu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.02384v1-abstract-short" style="display: inline;"> Ensuring the safety and harmlessness of Large Language Models (LLMs) has become equally critical as their performance in applications. However, existing safety alignment methods typically suffer from safety-performance trade-offs and the susceptibility to jailbreak attacks, primarily due to their reliance on direct refusals for malicious queries. 
In this paper, we propose STAIR, a novel framework&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.02384v1-abstract-full').style.display = 'inline'; document.getElementById('2502.02384v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.02384v1-abstract-full" style="display: none;"> Ensuring the safety and harmlessness of Large Language Models (LLMs) has become equally critical as their performance in applications. However, existing safety alignment methods typically suffer from safety-performance trade-offs and the susceptibility to jailbreak attacks, primarily due to their reliance on direct refusals for malicious queries. In this paper, we propose STAIR, a novel framework that integrates SafeTy Alignment with Introspective Reasoning. We enable LLMs to identify safety risks through step-by-step analysis by self-improving chain-of-thought (CoT) reasoning with safety awareness. STAIR first equips the model with a structured reasoning capability and then advances safety alignment via iterative preference optimization on step-level reasoning data generated using our newly proposed Safety-Informed Monte Carlo Tree Search (SI-MCTS). We further train a process reward model on this data to guide test-time searches for improved responses. Extensive experiments show that STAIR effectively mitigates harmful outputs while better preserving helpfulness, compared to instinctive alignment strategies. With test-time scaling, STAIR achieves a safety performance comparable to Claude-3.5 against popular jailbreak attacks. Relevant resources in this work are available at https://github.com/thu-ml/STAIR. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.02384v1-abstract-full').style.display = 'none'; document.getElementById('2502.02384v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">22 pages, 8 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.02338">arXiv:2502.02338</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.02338">pdf</a>, <a href="https://arxiv.org/format/2502.02338">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Geometric Neural Process Fields </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Yin%2C+W">Wenzhe Yin</a>, <a href="/search/cs?searchtype=author&amp;query=Xiao%2C+Z">Zehao Xiao</a>, <a href="/search/cs?searchtype=author&amp;query=Shen%2C+J">Jiayi Shen</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+Y">Yunlu Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Snoek%2C+C+G+M">Cees G. M.
Snoek</a>, <a href="/search/cs?searchtype=author&amp;query=Sonke%2C+J">Jan-Jakob Sonke</a>, <a href="/search/cs?searchtype=author&amp;query=Gavves%2C+E">Efstratios Gavves</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.02338v1-abstract-short" style="display: inline;"> This paper addresses the challenge of Neural Field (NeF) generalization, where models must efficiently adapt to new signals given only a few observations. To tackle this, we propose Geometric Neural Process Fields (G-NPF), a probabilistic framework for neural radiance fields that explicitly captures uncertainty. We formulate NeF generalization as a probabilistic problem, enabling direct inference&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.02338v1-abstract-full').style.display = 'inline'; document.getElementById('2502.02338v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.02338v1-abstract-full" style="display: none;"> This paper addresses the challenge of Neural Field (NeF) generalization, where models must efficiently adapt to new signals given only a few observations. To tackle this, we propose Geometric Neural Process Fields (G-NPF), a probabilistic framework for neural radiance fields that explicitly captures uncertainty. We formulate NeF generalization as a probabilistic problem, enabling direct inference of NeF function distributions from limited context observations. To incorporate structural inductive biases, we introduce a set of geometric bases that encode spatial structure and facilitate the inference of NeF function distributions. Building on these bases, we design a hierarchical latent variable model, allowing G-NPF to integrate structural information across multiple spatial levels and effectively parameterize INR functions. This hierarchical approach improves generalization to novel scenes and unseen signals. Experiments on novel-view synthesis for 3D scenes, as well as 2D image and 1D signal regression, demonstrate the effectiveness of our method in capturing uncertainty and leveraging structural information for improved generalization. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.02338v1-abstract-full').style.display = 'none'; document.getElementById('2502.02338v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.01171">arXiv:2502.01171</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.01171">pdf</a>, <a href="https://arxiv.org/format/2502.01171">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computational Physics">physics.comp-ph</span> </div> </div> <p class="title is-5 mathjax"> Efficient and Scalable Density Functional Theory Hamiltonian Prediction through Adaptive Sparsity </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Luo%2C+E">Erpai Luo</a>, <a href="/search/cs?searchtype=author&amp;query=Wei%2C+X">Xinran Wei</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+L">Lin Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Y">Yunyang Li</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+H">Han Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Z">Zun Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+C">Chang Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Xia%2C+Z">Zaishuo Xia</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+J">Jia Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Shao%2C+B">Bin Shao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.01171v1-abstract-short" style="display: inline;"> Hamiltonian matrix prediction is pivotal in computational chemistry, serving as the foundation for determining a wide range of molecular properties. While SE(3) equivariant graph neural networks have achieved remarkable success in this domain, their substantial computational cost-driven by high-order tensor product (TP) operations-restricts their scalability to large molecular systems with extensi&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.01171v1-abstract-full').style.display = 'inline'; document.getElementById('2502.01171v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.01171v1-abstract-full" style="display: none;"> Hamiltonian matrix prediction is pivotal in computational chemistry, serving as the foundation for determining a wide range of molecular properties. While SE(3) equivariant graph neural networks have achieved remarkable success in this domain, their substantial computational cost-driven by high-order tensor product (TP) operations-restricts their scalability to large molecular systems with extensive basis sets. To address this challenge, we introduce SPHNet, an efficient and scalable equivariant network that incorporates adaptive sparsity into Hamiltonian prediction. SPHNet employs two innovative sparse gates to selectively constrain non-critical interaction combinations, significantly reducing tensor product computations while maintaining accuracy. To optimize the sparse representation, we develop a Three-phase Sparsity Scheduler, ensuring stable convergence and achieving high performance at sparsity rates of up to 70 percent. 
Extensive evaluations on QH9 and PubchemQH datasets demonstrate that SPHNet achieves state-of-the-art accuracy while providing up to a 7x speedup over existing models. Beyond Hamiltonian prediction, the proposed sparsification techniques also hold significant potential for improving the efficiency and scalability of other SE(3) equivariant networks, further broadening their applicability and impact. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.01171v1-abstract-full').style.display = 'none'; document.getElementById('2502.01171v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.18959">arXiv:2501.18959</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2501.18959">pdf</a>, <a href="https://arxiv.org/format/2501.18959">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Enhancing Neural Function Approximation: The XNet Outperforming KAN </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Li%2C+X">Xin Li</a>, <a href="/search/cs?searchtype=author&amp;query=Zheng%2C+X">Xiaotao Zheng</a>, <a href="/search/cs?searchtype=author&amp;query=Xia%2C+Z">Zhihong Xia</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.18959v2-abstract-short" style="display: inline;"> XNet is a single-layer neural network architecture that leverages Cauchy integral-based activation functions for high-order function approximation. Through theoretical analysis, we show that the Cauchy activation functions used in XNet can achieve arbitrary-order polynomial convergence, fundamentally outperforming traditional MLPs and Kolmogorov-Arnold Networks (KANs) that rely on increased depth&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.18959v2-abstract-full').style.display = 'inline'; document.getElementById('2501.18959v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.18959v2-abstract-full" style="display: none;"> XNet is a single-layer neural network architecture that leverages Cauchy integral-based activation functions for high-order function approximation. Through theoretical analysis, we show that the Cauchy activation functions used in XNet can achieve arbitrary-order polynomial convergence, fundamentally outperforming traditional MLPs and Kolmogorov-Arnold Networks (KANs) that rely on increased depth or B-spline activations. Our extensive experiments on function approximation, PDE solving, and reinforcement learning demonstrate XNet&#39;s superior performance - reducing approximation error by up to 50000 times and accelerating training by up to 10 times compared to existing approaches. 
These results establish XNet as a highly efficient architecture for both scientific computing and AI applications. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.18959v2-abstract-full').style.display = 'none'; document.getElementById('2501.18959v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 31 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">arXiv admin note: text overlap with arXiv:2410.02033</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.16404">arXiv:2501.16404</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2501.16404">pdf</a>, <a href="https://arxiv.org/format/2501.16404">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> DynaPrompt: Dynamic Test-Time Prompt Tuning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Xiao%2C+Z">Zehao Xiao</a>, <a href="/search/cs?searchtype=author&amp;query=Yan%2C+S">Shilin Yan</a>, <a href="/search/cs?searchtype=author&amp;query=Hong%2C+J">Jack Hong</a>, <a href="/search/cs?searchtype=author&amp;query=Cai%2C+J">Jiayin Cai</a>, <a href="/search/cs?searchtype=author&amp;query=Jiang%2C+X">Xiaolong Jiang</a>, <a href="/search/cs?searchtype=author&amp;query=Hu%2C+Y">Yao Hu</a>, <a href="/search/cs?searchtype=author&amp;query=Shen%2C+J">Jiayi Shen</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Q">Qi Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Snoek%2C+C+G+M">Cees G. M. Snoek</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.16404v1-abstract-short" style="display: inline;"> Test-time prompt tuning enhances zero-shot generalization of vision-language models but tends to ignore the relatedness among test samples during inference. Online test-time prompt tuning provides a simple way to leverage the information in previous test samples, albeit with the risk of prompt collapse due to error accumulation. To enhance test-time prompt tuning, we propose DynaPrompt, short for&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.16404v1-abstract-full').style.display = 'inline'; document.getElementById('2501.16404v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.16404v1-abstract-full" style="display: none;"> Test-time prompt tuning enhances zero-shot generalization of vision-language models but tends to ignore the relatedness among test samples during inference. 
Online test-time prompt tuning provides a simple way to leverage the information in previous test samples, albeit with the risk of prompt collapse due to error accumulation. To enhance test-time prompt tuning, we propose DynaPrompt, short for dynamic test-time prompt tuning, exploiting relevant data distribution information while reducing error accumulation. Built on an online prompt buffer, DynaPrompt adaptively selects and optimizes the relevant prompts for each test sample during tuning. Specifically, we introduce a dynamic prompt selection strategy based on two metrics: prediction entropy and probability difference. For unseen test data information, we develop dynamic prompt appending, which allows the buffer to append new prompts and delete the inactive ones. By doing so, the prompts are optimized to exploit beneficial information on specific test data, while alleviating error accumulation. Experiments on fourteen datasets demonstrate the effectiveness of dynamic test-time prompt tuning. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.16404v1-abstract-full').style.display = 'none'; document.getElementById('2501.16404v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ICLR 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.14492">arXiv:2501.14492</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2501.14492">pdf</a>, <a href="https://arxiv.org/format/2501.14492">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> RealCritic: Towards Effectiveness-Driven Evaluation of Language Model Critiques </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Tang%2C+Z">Zhengyang Tang</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Z">Ziniu Li</a>, <a href="/search/cs?searchtype=author&amp;query=Xiao%2C+Z">Zhenyang Xiao</a>, <a href="/search/cs?searchtype=author&amp;query=Ding%2C+T">Tian Ding</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+R">Ruoyu Sun</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+B">Benyou Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+D">Dayiheng Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+F">Fei Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+T">Tianyu Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Yu%2C+B">Bowen Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Lin%2C+J">Junyang Lin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.14492v1-abstract-short" style="display: inline;"> Critiques 
are important for enhancing the performance of Large Language Models (LLMs), enabling both self-improvement and constructive feedback for others by identifying flaws and suggesting improvements. However, evaluating the critique capabilities of LLMs presents a significant challenge due to the open-ended nature of the task. In this work, we introduce a new benchmark designed to assess the&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.14492v1-abstract-full').style.display = 'inline'; document.getElementById('2501.14492v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.14492v1-abstract-full" style="display: none;"> Critiques are important for enhancing the performance of Large Language Models (LLMs), enabling both self-improvement and constructive feedback for others by identifying flaws and suggesting improvements. However, evaluating the critique capabilities of LLMs presents a significant challenge due to the open-ended nature of the task. In this work, we introduce a new benchmark designed to assess the critique capabilities of LLMs. Unlike existing benchmarks, which typically function in an open-loop fashion, our approach employs a closed-loop methodology that evaluates the quality of corrections generated from critiques. Moreover, the benchmark incorporates features such as self-critique, cross-critique, and iterative critique, which are crucial for distinguishing the abilities of advanced reasoning models from more classical ones. We implement this benchmark using eight challenging reasoning tasks. We have several interesting findings. First, despite demonstrating comparable performance in direct chain-of-thought generation, classical LLMs significantly lag behind the advanced reasoning-based model o1-mini across all critique scenarios. Second, in self-critique and iterative critique settings, classical LLMs may even underperform relative to their baseline capabilities. We hope that this benchmark will serve as a valuable resource to guide future advancements. The code and data are available at \url{https://github.com/tangzhy/RealCritic}. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.14492v1-abstract-full').style.display = 'none'; document.getElementById('2501.14492v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. 
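<p>To make the closed-loop idea in the entry above concrete, here is a small illustrative sketch: a critique is credited only if the correction it induces turns out to be right. The callables <code>solve</code>, <code>critique</code>, <code>revise</code>, and <code>is_correct</code> are hypothetical stand-ins, not the benchmark's API.</p> <pre><code># Illustrative closed-loop scoring sketch; all callables are hypothetical stand-ins.
def closed_loop_critique_score(problems, solve, critique, revise, is_correct):
    fixed = 0
    for problem in problems:
        answer = solve(problem)                      # initial model attempt
        feedback = critique(problem, answer)         # critique of that attempt
        revised = revise(problem, answer, feedback)  # correction generated from the critique
        fixed += int(is_correct(problem, revised))   # credit the critique only if the fix is right
    return fixed / max(len(problems), 1)             # fraction of problems repaired by critiques
</code></pre>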
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.13397">arXiv:2501.13397</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2501.13397">pdf</a>, <a href="https://arxiv.org/format/2501.13397">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> ExLM: Rethinking the Impact of [MASK] Tokens in Masked Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zheng%2C+K">Kangjie Zheng</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+J">Junwei Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Liang%2C+S">Siyue Liang</a>, <a href="/search/cs?searchtype=author&amp;query=Feng%2C+B">Bin Feng</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Z">Zequn Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Ju%2C+W">Wei Ju</a>, <a href="/search/cs?searchtype=author&amp;query=Xiao%2C+Z">Zhiping Xiao</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+M">Ming Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.13397v4-abstract-short" style="display: inline;"> Masked Language Models (MLMs) have achieved remarkable success in many self-supervised representation learning tasks. MLMs are trained by randomly masking portions of the input sequences with [MASK] tokens and learning to reconstruct the original content based on the remaining context. This paper explores the impact of [MASK] tokens on MLMs. Analytical studies show that masking tokens can introduc&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.13397v4-abstract-full').style.display = 'inline'; document.getElementById('2501.13397v4-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.13397v4-abstract-full" style="display: none;"> Masked Language Models (MLMs) have achieved remarkable success in many self-supervised representation learning tasks. MLMs are trained by randomly masking portions of the input sequences with [MASK] tokens and learning to reconstruct the original content based on the remaining context. This paper explores the impact of [MASK] tokens on MLMs. Analytical studies show that masking tokens can introduce the corrupted semantics problem, wherein the corrupted context may convey multiple, ambiguous meanings. This problem is also a key factor affecting the performance of MLMs on downstream tasks. Based on these findings, we propose a novel enhanced-context MLM, ExLM. Our approach expands [MASK] tokens in the input context and models the dependencies between these expanded states. This enhancement increases context capacity and enables the model to capture richer semantic information, effectively mitigating the corrupted semantics problem during pre-training. Experimental results demonstrate that ExLM achieves significant performance improvements in both text modeling and SMILES modeling tasks. 
Further analysis confirms that ExLM enriches semantic representations through context enhancement, and effectively reduces the semantic multimodality commonly observed in MLMs. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.13397v4-abstract-full').style.display = 'none'; document.getElementById('2501.13397v4-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 23 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">30 pages, 12 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.12557">arXiv:2501.12557</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2501.12557">pdf</a>, <a href="https://arxiv.org/format/2501.12557">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computers and Society">cs.CY</span> </div> </div> <p class="title is-5 mathjax"> Understanding the LLM-ification of CHI: Unpacking the Impact of LLMs at CHI through a Systematic Literature Review </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Pang%2C+R+Y">Rock Yuren Pang</a>, <a href="/search/cs?searchtype=author&amp;query=Schroeder%2C+H">Hope Schroeder</a>, <a href="/search/cs?searchtype=author&amp;query=Smith%2C+K+S">Kynnedy Simone Smith</a>, <a href="/search/cs?searchtype=author&amp;query=Barocas%2C+S">Solon Barocas</a>, <a href="/search/cs?searchtype=author&amp;query=Xiao%2C+Z">Ziang Xiao</a>, <a href="/search/cs?searchtype=author&amp;query=Tseng%2C+E">Emily Tseng</a>, <a href="/search/cs?searchtype=author&amp;query=Bragg%2C+D">Danielle Bragg</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.12557v1-abstract-short" style="display: inline;"> Large language models (LLMs) have been positioned to revolutionize HCI, by reshaping not only the interfaces, design patterns, and sociotechnical systems that we study, but also the research practices we use. To-date, however, there has been little understanding of LLMs&#39; uptake in HCI. We address this gap via a systematic literature review of 153 CHI papers from 2020-24 that engage with LLMs. 
We t&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.12557v1-abstract-full').style.display = 'inline'; document.getElementById('2501.12557v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.12557v1-abstract-full" style="display: none;"> Large language models (LLMs) have been positioned to revolutionize HCI, by reshaping not only the interfaces, design patterns, and sociotechnical systems that we study, but also the research practices we use. To-date, however, there has been little understanding of LLMs&#39; uptake in HCI. We address this gap via a systematic literature review of 153 CHI papers from 2020-24 that engage with LLMs. We taxonomize: (1) domains where LLMs are applied; (2) roles of LLMs in HCI projects; (3) contribution types; and (4) acknowledged limitations and risks. We find LLM work in 10 diverse domains, primarily via empirical and artifact contributions. Authors use LLMs in five distinct roles, including as research tools or simulated users. Still, authors often raise validity and reproducibility concerns, and overwhelmingly study closed models. We outline opportunities to improve HCI research with and on LLMs, and provide guiding questions for researchers to consider the validity and appropriateness of LLM-related work. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.12557v1-abstract-full').style.display = 'none'; document.getElementById('2501.12557v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">This is a preprint version of the paper conditionally accepted to CHI&#39;25</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.11508">arXiv:2501.11508</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2501.11508">pdf</a>, <a href="https://arxiv.org/format/2501.11508">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> See In Detail: Enhancing Sparse-view 3D Gaussian Splatting with Local Depth and Semantic Regularization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=He%2C+Z">Zongqi He</a>, <a href="/search/cs?searchtype=author&amp;query=Xiao%2C+Z">Zhe Xiao</a>, <a href="/search/cs?searchtype=author&amp;query=Chan%2C+K">Kin-Chung Chan</a>, <a href="/search/cs?searchtype=author&amp;query=Zuo%2C+Y">Yushen Zuo</a>, <a href="/search/cs?searchtype=author&amp;query=Xiao%2C+J">Jun Xiao</a>, <a href="/search/cs?searchtype=author&amp;query=Lam%2C+K">Kin-Man Lam</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.11508v1-abstract-short" style="display: inline;"> 3D Gaussian Splatting (3DGS) has shown remarkable performance in novel view synthesis. 
However, its rendering quality deteriorates with sparse input views, leading to distorted content and reduced details. This limitation hinders its practical application. To address this issue, we propose a sparse-view 3DGS method. Given the inherently ill-posed nature of sparse-view rendering, incorporating pri&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.11508v1-abstract-full').style.display = 'inline'; document.getElementById('2501.11508v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.11508v1-abstract-full" style="display: none;"> 3D Gaussian Splatting (3DGS) has shown remarkable performance in novel view synthesis. However, its rendering quality deteriorates with sparse input views, leading to distorted content and reduced details. This limitation hinders its practical application. To address this issue, we propose a sparse-view 3DGS method. Given the inherently ill-posed nature of sparse-view rendering, incorporating prior information is crucial. We propose a semantic regularization technique, using features extracted from the pretrained DINO-ViT model, to ensure multi-view semantic consistency. Additionally, we propose local depth regularization, which constrains depth values to improve generalization on unseen views. Our method outperforms state-of-the-art novel view synthesis approaches, achieving up to 0.4dB improvement in terms of PSNR on the LLFF dataset, with reduced distortion and enhanced visual quality. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.11508v1-abstract-full').style.display = 'none'; document.getElementById('2501.11508v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. 
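<p>A toy composite objective in the spirit of the entry above, combining a photometric term with local depth regularization and a semantic-consistency term between view features (e.g., DINO-ViT embeddings); the weights and exact terms are assumptions for illustration, not the paper's formulation.</p> <pre><code>import numpy as np

# Illustrative loss sketch only; term definitions and weights are assumptions.
def sparse_view_loss(rendered, target, depth, depth_prior, feat_a, feat_b,
                     w_depth=0.1, w_sem=0.05):
    photometric = np.mean((rendered - target) ** 2)      # rendering fidelity
    depth_reg = np.mean(np.abs(depth - depth_prior))     # local depth regularization
    cos = np.dot(feat_a, feat_b) / (np.linalg.norm(feat_a) * np.linalg.norm(feat_b) + 1e-8)
    semantic = 1.0 - cos                                  # multi-view semantic consistency
    return photometric + w_depth * depth_reg + w_sem * semantic
</code></pre>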
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">5 pages, 5 figures, has been accepted by the ICASSP 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.11039">arXiv:2501.11039</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2501.11039">pdf</a>, <a href="https://arxiv.org/format/2501.11039">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Beyond Any-Shot Adaptation: Predicting Optimization Outcome for Robustness Gains without Extra Pay </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Q+C">Qi Cheems Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Xiao%2C+Z">Zehao Xiao</a>, <a href="/search/cs?searchtype=author&amp;query=Mao%2C+Y">Yixiu Mao</a>, <a href="/search/cs?searchtype=author&amp;query=Qu%2C+Y">Yun Qu</a>, <a href="/search/cs?searchtype=author&amp;query=Shen%2C+J">Jiayi Shen</a>, <a href="/search/cs?searchtype=author&amp;query=Lv%2C+Y">Yiqin Lv</a>, <a href="/search/cs?searchtype=author&amp;query=Ji%2C+X">Xiangyang Ji</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.11039v3-abstract-short" style="display: inline;"> The foundation model enables general-purpose problem-solving and enjoys desirable rapid adaptation due to its adopted cross-task generalization paradigms, e.g., pretraining, meta-training, and finetuning. Recent advances in these paradigms show the crucial role of challenging tasks&#39; prioritized sampling in enhancing adaptation robustness. However, ranking task difficulties exhausts massive task qu&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.11039v3-abstract-full').style.display = 'inline'; document.getElementById('2501.11039v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.11039v3-abstract-full" style="display: none;"> The foundation model enables general-purpose problem-solving and enjoys desirable rapid adaptation due to its adopted cross-task generalization paradigms, e.g., pretraining, meta-training, and finetuning. Recent advances in these paradigms show the crucial role of challenging tasks&#39; prioritized sampling in enhancing adaptation robustness. However, ranking task difficulties exhausts massive task queries to evaluate, thus computation and annotation intensive, which is typically unaffordable in practice. This work underscores the criticality of both adaptation robustness and learning efficiency, especially in scenarios where tasks are risky or costly to evaluate, e.g., policy evaluations in Markov decision processes (MDPs) or inference with large models. To this end, we present Model Predictive Task Sampling (MPTS) to establish connections between the task space and adaptation risk landscape to form a theoretical guideline in robust active task sampling. MPTS characterizes the task episodic information with a generative model and directly predicts task-specific adaptation risk values from posterior inference. 
The developed risk learner can amortize expensive evaluation and provably approximately rank task difficulties in the pursuit of task robust adaptation. MPTS can be seamlessly integrated into zero-shot, few-shot, and many-shot learning paradigms. Extensive experimental results are conducted to exhibit the superiority of the proposed framework, remarkably increasing task adaptation robustness and retaining learning efficiency in contrast to existing state-of-the-art (SOTA) methods. The code is available at the project site https://github.com/thu-rllab/MPTS. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.11039v3-abstract-full').style.display = 'none'; document.getElementById('2501.11039v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 19 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.07711">arXiv:2501.07711</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2501.07711">pdf</a>, <a href="https://arxiv.org/format/2501.07711">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1109/TMM.2024.3368931">10.1109/TMM.2024.3368931 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Pedestrian Trajectory Prediction Based on Social Interactions Learning With Random Weights </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Xie%2C+J">Jiajia Xie</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+S">Sheng Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Xia%2C+B">Beihao Xia</a>, <a href="/search/cs?searchtype=author&amp;query=Xiao%2C+Z">Zhu Xiao</a>, <a href="/search/cs?searchtype=author&amp;query=Jiang%2C+H">Hongbo Jiang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+S">Siwang Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Qin%2C+Z">Zheng Qin</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+H">Hongyang Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.07711v1-abstract-short" style="display: inline;"> Pedestrian trajectory prediction is a critical technology in the evolution of self-driving cars toward complete artificial intelligence. Over recent years, focusing on the trajectories of pedestrians to model their social interactions has surged with great interest in more accurate trajectory predictions. 
However, existing methods for modeling pedestrian social interactions rely on pre-defined rul&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.07711v1-abstract-full').style.display = 'inline'; document.getElementById('2501.07711v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.07711v1-abstract-full" style="display: none;"> Pedestrian trajectory prediction is a critical technology in the evolution of self-driving cars toward complete artificial intelligence. Over recent years, focusing on the trajectories of pedestrians to model their social interactions has surged with great interest in more accurate trajectory predictions. However, existing methods for modeling pedestrian social interactions rely on pre-defined rules, struggling to capture non-explicit social interactions. In this work, we propose a novel framework named DTGAN, which extends the application of Generative Adversarial Networks (GANs) to graph sequence data, with the primary objective of automatically capturing implicit social interactions and achieving precise predictions of pedestrian trajectory. DTGAN innovatively incorporates random weights within each graph to eliminate the need for pre-defined interaction rules. We further enhance the performance of DTGAN by exploring diverse task loss functions during adversarial training, which yields improvements of 16.7\% and 39.3\% on metrics ADE and FDE, respectively. The effectiveness and accuracy of our framework are verified on two public datasets. The experimental results show that our proposed DTGAN achieves superior performance and is well able to understand pedestrians&#39; intentions. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.07711v1-abstract-full').style.display = 'none'; document.getElementById('2501.07711v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. 
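<p>One concrete reading of the "random weights within each graph" idea in the entry above: interaction edges receive sampled weights rather than values from hand-crafted rules (distance thresholds, fields of view, and so on). The sketch below is an assumption-laden illustration, not the DTGAN implementation.</p> <pre><code>import random

# Illustrative sketch: a fully connected pedestrian interaction graph with random edge weights.
def random_weight_graph(num_pedestrians, seed=None):
    rng = random.Random(seed)
    graph = {}
    for i in range(num_pedestrians):
        for j in range(num_pedestrians):
            if i != j:
                graph[(i, j)] = rng.random()  # random interaction weight, no pre-defined rule
    return graph
</code></pre>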
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">13 pages,7 figures,Accepted to IEEE Transactions on Multimedia (TMM)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.07058">arXiv:2501.07058</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2501.07058">pdf</a>, <a href="https://arxiv.org/format/2501.07058">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Logic Meets Magic: LLMs Cracking Smart Contract Vulnerabilities </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Xiao%2C+Z">ZeKe Xiao</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Q">Qin Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Pearce%2C+H">Hammond Pearce</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+S">Shiping Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.07058v1-abstract-short" style="display: inline;"> Smart contract vulnerabilities caused significant economic losses in blockchain applications. Large Language Models (LLMs) provide new possibilities for addressing this time-consuming task. However, state-of-the-art LLM-based detection solutions are often plagued by high false-positive rates. In this paper, we push the boundaries of existing research in two key ways. First, our evaluation is bas&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.07058v1-abstract-full').style.display = 'inline'; document.getElementById('2501.07058v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.07058v1-abstract-full" style="display: none;"> Smart contract vulnerabilities caused significant economic losses in blockchain applications. Large Language Models (LLMs) provide new possibilities for addressing this time-consuming task. However, state-of-the-art LLM-based detection solutions are often plagued by high false-positive rates. In this paper, we push the boundaries of existing research in two key ways. First, our evaluation is based on Solidity v0.8, offering the most up-to-date insights compared to prior studies that focus on older versions (v0.4). Second, we leverage the latest five LLM models (across companies), ensuring comprehensive coverage across the most advanced capabilities in the field. We conducted a series of rigorous evaluations. Our experiments demonstrate that a well-designed prompt can reduce the false-positive rate by over 60%. Surprisingly, we also discovered that the recall rate for detecting some specific vulnerabilities in Solidity v0.8 has dropped to just 13% compared to earlier versions (i.e., v0.4). Further analysis reveals the root cause of this decline: the reliance of LLMs on identifying changes in newly introduced libraries and frameworks during detection. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.07058v1-abstract-full').style.display = 'none'; document.getElementById('2501.07058v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.05892">arXiv:2501.05892</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2501.05892">pdf</a>, <a href="https://arxiv.org/format/2501.05892">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Beyond Flat Text: Dual Self-inherited Guidance for Visual Text Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Luo%2C+M">Minxing Luo</a>, <a href="/search/cs?searchtype=author&amp;query=Xia%2C+Z">Zixun Xia</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+L">Liaojun Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Z">Zhenhang Li</a>, <a href="/search/cs?searchtype=author&amp;query=Zeng%2C+W">Weichao Zeng</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+J">Jianye Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Cheng%2C+W">Wentao Cheng</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Y">Yaxing Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+Y">Yu Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+J">Jian Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.05892v1-abstract-short" style="display: inline;"> In real-world images, slanted or curved texts, especially those on cans, banners, or badges, appear as frequently, if not more so, than flat texts due to artistic design or layout constraints. While high-quality visual text generation has become available with the advanced generative capabilities of diffusion models, these models often produce distorted text and inharmonious text background when g&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.05892v1-abstract-full').style.display = 'inline'; document.getElementById('2501.05892v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.05892v1-abstract-full" style="display: none;"> In real-world images, slanted or curved texts, especially those on cans, banners, or badges, appear as frequently, if not more so, than flat texts due to artistic design or layout constraints. While high-quality visual text generation has become available with the advanced generative capabilities of diffusion models, these models often produce distorted text and inharmonious text background when given slanted or curved text layouts due to training data limitation. In this paper, we introduce a new training-free framework, STGen, which accurately generates visual texts in challenging scenarios (\eg, slanted or curved text layouts) while harmonizing them with the text background. 
Our framework decomposes the visual text generation process into two branches: (i) \textbf{Semantic Rectification Branch}, which leverages the ability in generating flat but accurate visual texts of the model to guide the generation of challenging scenarios. The generated latent of flat text is abundant in accurate semantic information related both to the text itself and its background. By incorporating this, we rectify the semantic information of the texts and harmonize the integration of the text with its background in complex layouts. (ii) \textbf{Structure Injection Branch}, which reinforces the visual text structure during inference. We incorporate the latent information of the glyph image, rich in glyph structure, as a new condition to further strengthen the text structure. To enhance image harmony, we also apply an effective combination method to merge the priors, providing a solid foundation for generation. Extensive experiments across a variety of visual text layouts demonstrate that our framework achieves superior accuracy and outstanding quality. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.05892v1-abstract-full').style.display = 'none'; document.getElementById('2501.05892v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.05727">arXiv:2501.05727</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2501.05727">pdf</a>, <a href="https://arxiv.org/format/2501.05727">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Enabling Scalable Oversight via Self-Evolving Critic </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Tang%2C+Z">Zhengyang Tang</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Z">Ziniu Li</a>, <a href="/search/cs?searchtype=author&amp;query=Xiao%2C+Z">Zhenyang Xiao</a>, <a href="/search/cs?searchtype=author&amp;query=Ding%2C+T">Tian Ding</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+R">Ruoyu Sun</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+B">Benyou Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+D">Dayiheng Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+F">Fei Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+T">Tianyu Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Yu%2C+B">Bowen Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Lin%2C+J">Junyang Lin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.05727v1-abstract-short" style="display: inline;"> Despite their remarkable performance, the development of Large Language Models (LLMs) faces a critical challenge in scalable oversight: 
providing effective feedback for tasks where human evaluation is difficult or where LLMs outperform humans. While there is growing interest in using LLMs for critique, current approaches still rely on human annotations or more powerful models, leaving the issue of&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.05727v1-abstract-full').style.display = 'inline'; document.getElementById('2501.05727v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.05727v1-abstract-full" style="display: none;"> Despite their remarkable performance, the development of Large Language Models (LLMs) faces a critical challenge in scalable oversight: providing effective feedback for tasks where human evaluation is difficult or where LLMs outperform humans. While there is growing interest in using LLMs for critique, current approaches still rely on human annotations or more powerful models, leaving the issue of enhancing critique capabilities without external supervision unresolved. We introduce SCRIT (Self-evolving CRITic), a framework that enables genuine self-evolution of critique abilities. Technically, SCRIT self-improves by training on synthetic data, generated by a contrastive-based self-critic that uses reference solutions for step-by-step critique, and a self-validation mechanism that ensures critique quality through correction outcomes. Implemented with Qwen2.5-72B-Instruct, one of the most powerful LLMs, SCRIT achieves up to a 10.3\% improvement on critique-correction and error identification benchmarks. Our analysis reveals that SCRIT&#39;s performance scales positively with data and model size, outperforms alternative approaches, and benefits critically from its self-validation component. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.05727v1-abstract-full').style.display = 'none'; document.getElementById('2501.05727v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. 
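</p>
<p class="is-size-7">The critique-generation-plus-self-validation loop summarized in this abstract can be pictured with a minimal sketch; the helper name, prompt wording, and record fields below are illustrative assumptions rather than the authors' implementation.</p>
<pre><code># Illustrative sketch of the critique-data loop summarized above: critique a candidate
# solution while a reference solution is in view, then keep the critique only if the
# implied correction recovers the known answer (self-validation). The helper `ask_llm`
# and the record fields are hypothetical stand-ins, not the authors' code.

def ask_llm(prompt: str) -> str:
    raise NotImplementedError("plug in an actual chat-completion call here")

def build_critique_dataset(problems):
    kept = []
    for p in problems:  # p: {"question", "reference_solution", "candidate_solution", "answer"}
        critique = ask_llm(
            "Reference solution (do not reveal):\n" + p["reference_solution"]
            + "\n\nCritique this attempt step by step:\n" + p["candidate_solution"]
        )
        corrected = ask_llm(
            "Question:\n" + p["question"]
            + "\n\nApply the critique below and state the corrected final answer:\n" + critique
        )
        if p["answer"] in corrected:  # keep only critiques whose correction checks out
            kept.append({"input": p["candidate_solution"], "target_critique": critique})
    return kept
</code></pre>
<p class="is-size-7">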
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.04268">arXiv:2501.04268</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2501.04268">pdf</a>, <a href="https://arxiv.org/format/2501.04268">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Robotic Programmer: Video Instructed Policy Code Generation for Robotic Manipulation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Xie%2C+S">Senwei Xie</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+H">Hongyu Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Xiao%2C+Z">Zhanqi Xiao</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+R">Ruiping Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+X">Xilin Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.04268v1-abstract-short" style="display: inline;"> Zero-shot generalization across various robots, tasks and environments remains a significant challenge in robotic manipulation. Policy code generation methods use executable code to connect high-level task descriptions and low-level action sequences, leveraging the generalization capabilities of large language models and atomic skill libraries. In this work, we propose Robotic Programmer (RoboPro)&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.04268v1-abstract-full').style.display = 'inline'; document.getElementById('2501.04268v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.04268v1-abstract-full" style="display: none;"> Zero-shot generalization across various robots, tasks and environments remains a significant challenge in robotic manipulation. Policy code generation methods use executable code to connect high-level task descriptions and low-level action sequences, leveraging the generalization capabilities of large language models and atomic skill libraries. In this work, we propose Robotic Programmer (RoboPro), a robotic foundation model, enabling the capability of perceiving visual information and following free-form instructions to perform robotic manipulation with policy code in a zero-shot manner. To address low efficiency and high cost in collecting runtime code data for robotic tasks, we devise Video2Code to synthesize executable code from extensive videos in-the-wild with off-the-shelf vision-language model and code-domain large language model. Extensive experiments show that RoboPro achieves the state-of-the-art zero-shot performance on robotic manipulation in both simulators and real-world environments. Specifically, the zero-shot success rate of RoboPro on RLBench surpasses the state-of-the-art model GPT-4o by 11.6%, which is even comparable to a strong supervised training baseline. Furthermore, RoboPro is robust to variations on API formats and skill sets. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.04268v1-abstract-full').style.display = 'none'; document.getElementById('2501.04268v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.01568">arXiv:2501.01568</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2501.01568">pdf</a>, <a href="https://arxiv.org/format/2501.01568">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> Interruption Handling for Conversational Robots </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Cao%2C+S">Shiye Cao</a>, <a href="/search/cs?searchtype=author&amp;query=Moon%2C+J">Jiwon Moon</a>, <a href="/search/cs?searchtype=author&amp;query=Mahmood%2C+A">Amama Mahmood</a>, <a href="/search/cs?searchtype=author&amp;query=Antony%2C+V+N">Victor Nikhil Antony</a>, <a href="/search/cs?searchtype=author&amp;query=Xiao%2C+Z">Ziang Xiao</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+A">Anqi Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+C">Chien-Ming Huang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.01568v1-abstract-short" style="display: inline;"> Interruptions, a fundamental component of human communication, can enhance the dynamism and effectiveness of conversations, but only when effectively managed by all parties involved. Despite advancements in robotic systems, state-of-the-art systems still have limited capabilities in handling user-initiated interruptions in real-time. Prior research has primarily focused on post hoc analysis of int&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.01568v1-abstract-full').style.display = 'inline'; document.getElementById('2501.01568v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.01568v1-abstract-full" style="display: none;"> Interruptions, a fundamental component of human communication, can enhance the dynamism and effectiveness of conversations, but only when effectively managed by all parties involved. Despite advancements in robotic systems, state-of-the-art systems still have limited capabilities in handling user-initiated interruptions in real-time. Prior research has primarily focused on post hoc analysis of interruptions. To address this gap, we present a system that detects user-initiated interruptions and manages them in real-time based on the interrupter&#39;s intent (i.e., cooperative agreement, cooperative assistance, cooperative clarification, or disruptive interruption). The system was designed based on interaction patterns identified from human-human interaction data. 
We integrated our system into an LLM-powered social robot and validated its effectiveness through a timed decision-making task and a contentious discussion task with 21 participants. Our system successfully handled 93.69% (n=104/111) of user-initiated interruptions. We discuss our learnings and their implications for designing interruption-handling behaviors in conversational robots. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.01568v1-abstract-full').style.display = 'none'; document.getElementById('2501.01568v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.20083">arXiv:2412.20083</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.20083">pdf</a>, <a href="https://arxiv.org/format/2412.20083">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Achieving Full-Bandwidth Sensing Performance with Partial Bandwidth Allocation for ISAC </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Xiao%2C+Z">Zhiqiang Xiao</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+Z">Zhiwen Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Dai%2C+Q">Qianglong Dai</a>, <a href="/search/cs?searchtype=author&amp;query=Zeng%2C+Y">Yong Zeng</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+F">Fei Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+Y">Yan Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.20083v1-abstract-short" style="display: inline;"> This letter studies an uplink integrated sensing and communication (ISAC) system using discrete Fourier transform spread orthogonal frequency division multiplexing (DFT-s-OFDM) transmission. We try to answer the following fundamental question: With only a fractional bandwidth allocated to the user with sensing task, can the same delay resolution and unambiguous range be achieved as if all bandwidt&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.20083v1-abstract-full').style.display = 'inline'; document.getElementById('2412.20083v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.20083v1-abstract-full" style="display: none;"> This letter studies an uplink integrated sensing and communication (ISAC) system using discrete Fourier transform spread orthogonal frequency division multiplexing (DFT-s-OFDM) transmission. We try to answer the following fundamental question: With only a fractional bandwidth allocated to the user with sensing task, can the same delay resolution and unambiguous range be achieved as if all bandwidth were allocated to it? 
We affirmatively answer the question by proposing a novel two-stage delay estimation (TSDE) method that exploits the following facts: without increasing the allocated bandwidth, higher delay resolution can be achieved via distributed subcarrier allocation compared to its collocated counterpart, while there is a trade-off between delay resolution and unambiguous range by varying the decimation factor of subcarriers. Therefore, the key idea of the proposed TSDE method is to first perform coarse delay estimation with collocated subcarriers to achieve a large unambiguous range, and then use distributed subcarriers with optimized decimation factor to enhance delay resolution while avoiding delay ambiguity. Our analysis shows that the proposed TSDE method can achieve the full-bandwidth delay resolution and unambiguous range, by using only at most half of the full bandwidth, provided that the channel delay spread is less than half of the unambiguous range. Numerical results show the superiority of the proposed method over the conventional method with collocated subcarriers. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.20083v1-abstract-full').style.display = 'none'; document.getElementById('2412.20083v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.18111">arXiv:2412.18111</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.18111">pdf</a>, <a href="https://arxiv.org/format/2412.18111">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> AIGT: AI Generative Table Based on Prompt </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+M">Mingming Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Xiao%2C+Z">Zhiqing Xiao</a>, <a href="/search/cs?searchtype=author&amp;query=Lu%2C+G">Guoshan Lu</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+S">Sai Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+W">Weiqiang Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Fu%2C+X">Xing Fu</a>, <a href="/search/cs?searchtype=author&amp;query=Yi%2C+C">Can Yi</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+J">Junbo Zhao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.18111v1-abstract-short" style="display: inline;"> Tabular data, which accounts for over 80% of enterprise data assets, is vital in various fields. With growing concerns about privacy protection and data-sharing restrictions, generating high-quality synthetic tabular data has become essential. 
Recent advancements show that large language models (LLMs) can effectively generate realistic tabular data by leveraging semantic information and overcomin&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.18111v1-abstract-full').style.display = 'inline'; document.getElementById('2412.18111v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.18111v1-abstract-full" style="display: none;"> Tabular data, which accounts for over 80% of enterprise data assets, is vital in various fields. With growing concerns about privacy protection and data-sharing restrictions, generating high-quality synthetic tabular data has become essential. Recent advancements show that large language models (LLMs) can effectively generate realistic tabular data by leveraging semantic information and overcoming the challenges of high-dimensional data that arise from one-hot encoding. However, current methods do not fully utilize the rich information available in tables. To address this, we introduce AI Generative Table (AIGT) based on prompt enhancement, a novel approach that utilizes meta data information, such as table descriptions and schemas, as prompts to generate ultra-high quality synthetic data. To overcome the token limit constraints of LLMs, we propose long-token partitioning algorithms that enable AIGT to model tables of any scale. AIGT achieves state-of-the-art performance on 14 out of 20 public datasets and two real industry datasets within the Alipay risk control system. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.18111v1-abstract-full').style.display = 'none'; document.getElementById('2412.18111v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.16928">arXiv:2412.16928</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.16928">pdf</a>, <a href="https://arxiv.org/format/2412.16928">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> AV-DTEC: Self-Supervised Audio-Visual Fusion for Drone Trajectory Estimation and Classification </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Xiao%2C+Z">Zhenyuan Xiao</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+Y">Yizhuo Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+G">Guili Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Zeng%2C+X">Xianglong Zeng</a>, <a href="/search/cs?searchtype=author&amp;query=Yuan%2C+S">Shenghai Yuan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.16928v1-abstract-short" style="display: inline;"> The increasing use of compact UAVs has created significant threats to public safety, while traditional drone detection systems are often bulky and costly. To address these challenges, we propose AV-DTEC, a lightweight self-supervised audio-visual fusion-based anti-UAV system. AV-DTEC is trained using self-supervised learning with labels generated by LiDAR, and it simultaneously learns audio and vi&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.16928v1-abstract-full').style.display = 'inline'; document.getElementById('2412.16928v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.16928v1-abstract-full" style="display: none;"> The increasing use of compact UAVs has created significant threats to public safety, while traditional drone detection systems are often bulky and costly. To address these challenges, we propose AV-DTEC, a lightweight self-supervised audio-visual fusion-based anti-UAV system. AV-DTEC is trained using self-supervised learning with labels generated by LiDAR, and it simultaneously learns audio and visual features through a parallel selective state-space model. With the learned features, a specially designed plug-and-play primary-auxiliary feature enhancement module integrates visual features into audio features for better robustness in cross-lighting conditions. To reduce reliance on auxiliary features and align modalities, we propose a teacher-student model that adaptively adjusts the weighting of visual features. AV-DTEC demonstrates exceptional accuracy and effectiveness in real-world multi-modality data. The code and trained models are publicly accessible on GitHub \url{https://github.com/AmazingDay1/AV-DETC}. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.16928v1-abstract-full').style.display = 'none'; document.getElementById('2412.16928v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Submitted to ICRA 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.15005">arXiv:2412.15005</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.15005">pdf</a>, <a href="https://arxiv.org/format/2412.15005">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> DisCo: Graph-Based Disentangled Contrastive Learning for Cold-Start Cross-Domain Recommendation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Li%2C+H">Hourun Li</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Y">Yifan Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Xiao%2C+Z">Zhiping Xiao</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+J">Jia Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+C">Changling Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+M">Ming Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Ju%2C+W">Wei Ju</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.15005v3-abstract-short" style="display: inline;"> Recommender systems are widely used in various real-world applications, but they often encounter the persistent challenge of the user cold-start problem. Cross-domain recommendation (CDR), which leverages user interactions from one domain to improve prediction performance in another, has emerged as a promising solution. However, users with similar preferences in the source domain may exhibit diffe&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.15005v3-abstract-full').style.display = 'inline'; document.getElementById('2412.15005v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.15005v3-abstract-full" style="display: none;"> Recommender systems are widely used in various real-world applications, but they often encounter the persistent challenge of the user cold-start problem. Cross-domain recommendation (CDR), which leverages user interactions from one domain to improve prediction performance in another, has emerged as a promising solution. However, users with similar preferences in the source domain may exhibit different interests in the target domain. Therefore, directly transferring embeddings may introduce irrelevant source-domain collaborative information. 
In this paper, we propose a novel graph-based disentangled contrastive learning framework to capture fine-grained user intent and filter out irrelevant collaborative information, thereby avoiding negative transfer. Specifically, for each domain, we use a multi-channel graph encoder to capture diverse user intents. We then construct the affinity graph in the embedding space and perform multi-step random walks to capture high-order user similarity relationships. Treating one domain as the target, we propose a disentangled intent-wise contrastive learning approach, guided by user similarity, to refine the bridging of user intents across domains. Extensive experiments on four benchmark CDR datasets demonstrate that DisCo consistently outperforms existing state-of-the-art baselines, thereby validating the effectiveness of both DisCo and its components. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.15005v3-abstract-full').style.display = 'none'; document.getElementById('2412.15005v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 19 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted at AAAI 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.14922">arXiv:2412.14922</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.14922">pdf</a>, <a href="https://arxiv.org/format/2412.14922">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> RobustFT: Robust Supervised Fine-tuning for Large Language Models under Noisy Response </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Luo%2C+J">Junyu Luo</a>, <a href="/search/cs?searchtype=author&amp;query=Luo%2C+X">Xiao Luo</a>, <a href="/search/cs?searchtype=author&amp;query=Ding%2C+K">Kaize Ding</a>, <a href="/search/cs?searchtype=author&amp;query=Yuan%2C+J">Jingyang Yuan</a>, <a href="/search/cs?searchtype=author&amp;query=Xiao%2C+Z">Zhiping Xiao</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+M">Ming Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.14922v1-abstract-short" style="display: inline;"> Supervised fine-tuning (SFT) plays a crucial role in adapting large language models (LLMs) to specific domains or tasks. However, as demonstrated by empirical experiments, the collected data inevitably contains noise in practical applications, which poses significant challenges to model performance on downstream tasks. 
Therefore, there is an urgent need for a noise-robust SFT framework to enhance&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.14922v1-abstract-full').style.display = 'inline'; document.getElementById('2412.14922v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.14922v1-abstract-full" style="display: none;"> Supervised fine-tuning (SFT) plays a crucial role in adapting large language models (LLMs) to specific domains or tasks. However, as demonstrated by empirical experiments, the collected data inevitably contains noise in practical applications, which poses significant challenges to model performance on downstream tasks. Therefore, there is an urgent need for a noise-robust SFT framework to enhance model capabilities in downstream tasks. To address this challenge, we introduce a robust SFT framework (RobustFT) that performs noise detection and relabeling on downstream task data. For noise identification, our approach employs a multi-expert collaborative system with inference-enhanced models to achieve superior noise detection. In the denoising phase, we utilize a context-enhanced strategy, which incorporates the most relevant and confident knowledge followed by careful assessment to generate reliable annotations. Additionally, we introduce an effective data selection mechanism based on response entropy, ensuring only high-quality samples are retained for fine-tuning. Extensive experiments conducted on multiple LLMs across five datasets demonstrate RobustFT&#39;s exceptional performance in noisy scenarios. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.14922v1-abstract-full').style.display = 'none'; document.getElementById('2412.14922v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. 
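</p>
<p class="is-size-7">The response-entropy selection step mentioned in the abstract can be sketched as follows; the array layout and keep ratio are assumptions made for illustration, not details from the paper.</p>
<pre><code># Sketch of entropy-based sample selection in the spirit of the mechanism above:
# score each response by the mean entropy of its per-token distributions and keep
# the most confident fraction. Array layout and the keep ratio are assumptions.
import numpy as np

def mean_token_entropy(token_probs):
    """token_probs: (seq_len, vocab_size) rows summing to 1; returns mean entropy in nats."""
    p = np.clip(token_probs, 1e-12, 1.0)
    return float(np.mean(-np.sum(p * np.log(p), axis=-1)))

def select_low_entropy(samples, keep_ratio=0.7):
    """samples: list of dicts, each with a 'token_probs' array; keeps the lowest-entropy part."""
    ranked = sorted(samples, key=lambda s: mean_token_entropy(s["token_probs"]))
    return ranked[: max(1, int(len(ranked) * keep_ratio))]
</code></pre>
<p class="is-size-7">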
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.14456">arXiv:2412.14456</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.14456">pdf</a>, <a href="https://arxiv.org/format/2412.14456">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> LEDiff: Latent Exposure Diffusion for HDR Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wang%2C+C">Chao Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Xia%2C+Z">Zhihao Xia</a>, <a href="/search/cs?searchtype=author&amp;query=Leimkuehler%2C+T">Thomas Leimkuehler</a>, <a href="/search/cs?searchtype=author&amp;query=Myszkowski%2C+K">Karol Myszkowski</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+X">Xuaner Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.14456v2-abstract-short" style="display: inline;"> While consumer displays increasingly support more than 10 stops of dynamic range, most image assets such as internet photographs and generative AI content remain limited to 8-bit low dynamic range (LDR), constraining their utility across high dynamic range (HDR) applications. Currently, no generative model can produce high-bit, high-dynamic range content in a generalizable way. Existing LDR-to-HDR&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.14456v2-abstract-full').style.display = 'inline'; document.getElementById('2412.14456v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.14456v2-abstract-full" style="display: none;"> While consumer displays increasingly support more than 10 stops of dynamic range, most image assets such as internet photographs and generative AI content remain limited to 8-bit low dynamic range (LDR), constraining their utility across high dynamic range (HDR) applications. Currently, no generative model can produce high-bit, high-dynamic range content in a generalizable way. Existing LDR-to-HDR conversion methods often struggle to produce photorealistic details and physically-plausible dynamic range in the clipped areas. We introduce LEDiff, a method that enables a generative model with HDR content generation through latent space fusion inspired by image-space exposure fusion techniques. It also functions as an LDR-to-HDR converter, expanding the dynamic range of existing low-dynamic range images. Our approach uses a small HDR dataset to enable a pretrained diffusion model to recover detail and dynamic range in clipped highlights and shadows. LEDiff brings HDR capabilities to existing generative models and converts any LDR image to HDR, creating photorealistic HDR outputs for image generation, image-based lighting (HDR environment map generation), and photographic effects such as depth of field simulation, where linear HDR data is essential for realistic quality. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.14456v2-abstract-full').style.display = 'none'; document.getElementById('2412.14456v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 18 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.13735">arXiv:2412.13735</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.13735">pdf</a>, <a href="https://arxiv.org/format/2412.13735">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> 3D Registration in 30 Years: A Survey </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Yang%2C+J">Jiaqi Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+C">Chu&#39;ai Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Z">Zhengbao Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Cao%2C+X">Xinyue Cao</a>, <a href="/search/cs?searchtype=author&amp;query=Ouyang%2C+X">Xuan Ouyang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+X">Xiyu Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Zeng%2C+Z">Zhenxuan Zeng</a>, <a href="/search/cs?searchtype=author&amp;query=Zeng%2C+Z">Zhao Zeng</a>, <a href="/search/cs?searchtype=author&amp;query=Lu%2C+B">Borui Lu</a>, <a href="/search/cs?searchtype=author&amp;query=Xia%2C+Z">Zhiyi Xia</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Q">Qian Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Guo%2C+Y">Yulan Guo</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Y">Yanning Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.13735v2-abstract-short" style="display: inline;"> 3D point cloud registration is a fundamental problem in computer vision, computer graphics, robotics, remote sensing, and etc. Over the last thirty years, we have witnessed the amazing advancement in this area with numerous kinds of solutions. Although a handful of relevant surveys have been conducted, their coverage is still limited. In this work, we present a comprehensive survey on 3D point clo&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.13735v2-abstract-full').style.display = 'inline'; document.getElementById('2412.13735v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.13735v2-abstract-full" style="display: none;"> 3D point cloud registration is a fundamental problem in computer vision, computer graphics, robotics, remote sensing, and etc. Over the last thirty years, we have witnessed the amazing advancement in this area with numerous kinds of solutions. Although a handful of relevant surveys have been conducted, their coverage is still limited. 
In this work, we present a comprehensive survey on 3D point cloud registration, covering a set of sub-areas such as pairwise coarse registration, pairwise fine registration, multi-view registration, cross-scale registration, and multi-instance registration. The datasets, evaluation metrics, method taxonomy, discussions of the merits and demerits, insightful thoughts of future directions are comprehensively presented in this survey. The regularly updated project page of the survey is available at https://github.com/Amyyyy11/3D-Registration-in-30-Years-A-Survey. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.13735v2-abstract-full').style.display = 'none'; document.getElementById('2412.13735v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 18 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.13037">arXiv:2412.13037</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.13037">pdf</a>, <a href="https://arxiv.org/format/2412.13037">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> TAME: Temporal Audio-based Mamba for Enhanced Drone Trajectory Estimation and Classification </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Xiao%2C+Z">Zhenyuan Xiao</a>, <a href="/search/cs?searchtype=author&amp;query=Hu%2C+H">Huanran Hu</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+G">Guili Xu</a>, <a href="/search/cs?searchtype=author&amp;query=He%2C+J">Junwei He</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.13037v6-abstract-short" style="display: inline;"> The increasing prevalence of compact UAVs has introduced significant risks to public safety, while traditional drone detection systems are often bulky and costly. To address these challenges, we present TAME, the Temporal Audio-based Mamba for Enhanced Drone Trajectory Estimation and Classification. This innovative anti-UAV detection model leverages a parallel selective state-space model to simult&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.13037v6-abstract-full').style.display = 'inline'; document.getElementById('2412.13037v6-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.13037v6-abstract-full" style="display: none;"> The increasing prevalence of compact UAVs has introduced significant risks to public safety, while traditional drone detection systems are often bulky and costly. To address these challenges, we present TAME, the Temporal Audio-based Mamba for Enhanced Drone Trajectory Estimation and Classification. 
This innovative anti-UAV detection model leverages a parallel selective state-space model to simultaneously capture and learn both the temporal and spectral features of audio, effectively analyzing propagation of sound. To further enhance temporal features, we introduce a Temporal Feature Enhancement Module, which integrates spectral features into temporal data using residual cross-attention. This enhanced temporal information is then employed for precise 3D trajectory estimation and classification. Our model sets a new standard of performance on the MMUAD benchmarks, demonstrating superior accuracy and effectiveness. The code and trained models are publicly available on GitHub \url{https://github.com/AmazingDay1/TAME}. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.13037v6-abstract-full').style.display = 'none'; document.getElementById('2412.13037v6-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 17 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">This paper has been accepted for presentation at the IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP) 2025. Personal use of this material is permitted. Permission from IEEE must be obtained for all other uses</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.12984">arXiv:2412.12984</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.12984">pdf</a>, <a href="https://arxiv.org/format/2412.12984">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Social and Information Networks">cs.SI</span> </div> </div> <p class="title is-5 mathjax"> Cluster-guided Contrastive Class-imbalanced Graph Classification </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Ju%2C+W">Wei Ju</a>, <a href="/search/cs?searchtype=author&amp;query=Mao%2C+Z">Zhengyang Mao</a>, <a href="/search/cs?searchtype=author&amp;query=Yi%2C+S">Siyu Yi</a>, <a href="/search/cs?searchtype=author&amp;query=Qin%2C+Y">Yifang Qin</a>, <a href="/search/cs?searchtype=author&amp;query=Gu%2C+Y">Yiyang Gu</a>, <a href="/search/cs?searchtype=author&amp;query=Xiao%2C+Z">Zhiping Xiao</a>, <a href="/search/cs?searchtype=author&amp;query=Shen%2C+J">Jianhao Shen</a>, <a href="/search/cs?searchtype=author&amp;query=Qiao%2C+Z">Ziyue Qiao</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+M">Ming Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.12984v2-abstract-short" style="display: inline;"> This paper 
studies the problem of class-imbalanced graph classification, which aims at effectively classifying the graph categories in scenarios with imbalanced class distributions. While graph neural networks (GNNs) have achieved remarkable success, their modeling ability on imbalanced graph-structured data remains suboptimal, which typically leads to predictions biased towards the majority class&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.12984v2-abstract-full').style.display = 'inline'; document.getElementById('2412.12984v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.12984v2-abstract-full" style="display: none;"> This paper studies the problem of class-imbalanced graph classification, which aims at effectively classifying the graph categories in scenarios with imbalanced class distributions. While graph neural networks (GNNs) have achieved remarkable success, their modeling ability on imbalanced graph-structured data remains suboptimal, which typically leads to predictions biased towards the majority classes. On the other hand, existing class-imbalanced learning methods in vision may overlook the rich graph semantic substructures of the majority classes and excessively emphasize learning from the minority classes. To address these challenges, we propose a simple yet powerful approach called C$^3$GNN that integrates the idea of clustering into contrastive learning to enhance class-imbalanced graph classification. Technically, C$^3$GNN clusters graphs from each majority class into multiple subclasses, with sizes comparable to the minority class, mitigating class imbalance. It also employs the Mixup technique to generate synthetic samples, enriching the semantic diversity of each subclass. Furthermore, supervised contrastive learning is used to hierarchically learn effective graph representations, enabling the model to thoroughly explore semantic substructures in majority classes while avoiding excessive focus on minority classes. Extensive experiments on real-world graph benchmark datasets verify the superior performance of our proposed method against competitive baselines. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.12984v2-abstract-full').style.display = 'none'; document.getElementById('2412.12984v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 17 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. 
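</p>
<p class="is-size-7">The subclass-clustering step described in the abstract (splitting each majority class into roughly minority-sized subclasses before contrastive training) can be illustrated with a short sketch; applying KMeans to precomputed graph-level embeddings is an assumed choice made here for illustration, not necessarily the authors' exact design.</p>
<pre><code># Sketch of majority-class splitting as described above; assumes graph-level
# embeddings are already available. KMeans here is an illustrative choice.
import numpy as np
from sklearn.cluster import KMeans

def split_majority_classes(embeddings, labels):
    """embeddings: (n, d) array of graph embeddings; labels: (n,) integer class ids.
    Returns pseudo-labels where each majority class is split into minority-sized subclasses."""
    classes, counts = np.unique(labels, return_counts=True)
    minority_size = counts.min()
    new_labels = labels.copy()
    next_id = labels.max() + 1
    for c, n_c in zip(classes, counts):
        k = int(round(n_c / minority_size))
        if k > 1:  # only classes much larger than the minority class get split
            idx = np.where(labels == c)[0]
            sub = KMeans(n_clusters=k, n_init=10, random_state=0).fit_predict(embeddings[idx])
            for s in range(1, k):  # subclass 0 keeps the original label
                new_labels[idx[sub == s]] = next_id
                next_id += 1
    return new_labels
</code></pre>
<p class="is-size-7">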
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by Proceedings of the Thirty-Ninth AAAI Conference on Artificial Intelligence (AAAI-25)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.12531">arXiv:2412.12531</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.12531">pdf</a>, <a href="https://arxiv.org/ps/2412.12531">ps</a>, <a href="https://arxiv.org/format/2412.12531">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Movable Antenna Aided NOMA: Joint Antenna Positioning, Precoding, and Decoding Design </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Xiao%2C+Z">Zhenyu Xiao</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Z">Zhe Li</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+L">Lipeng Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Ning%2C+B">Boyu Ning</a>, <a href="/search/cs?searchtype=author&amp;query=da+Costa%2C+D+B">Daniel Benevides da Costa</a>, <a href="/search/cs?searchtype=author&amp;query=Xia%2C+X">Xiang-Gen Xia</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+R">Rui Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.12531v1-abstract-short" style="display: inline;"> This paper investigates movable antenna (MA) aided non-orthogonal multiple access (NOMA) for multi-user downlink communication, where the base station (BS) is equipped with a fixed-position antenna (FPA) array to serve multiple MA-enabled users. An optimization problem is formulated to maximize the minimum achievable rate among all the users by jointly optimizing the MA positioning of each user, t&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.12531v1-abstract-full').style.display = 'inline'; document.getElementById('2412.12531v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.12531v1-abstract-full" style="display: none;"> This paper investigates movable antenna (MA) aided non-orthogonal multiple access (NOMA) for multi-user downlink communication, where the base station (BS) is equipped with a fixed-position antenna (FPA) array to serve multiple MA-enabled users. An optimization problem is formulated to maximize the minimum achievable rate among all the users by jointly optimizing the MA positioning of each user, the precoding matrix at the BS, and the successive interference cancellation (SIC) decoding indicator matrix at the users, subject to a set of constraints including the limited movement area of the MAs, the maximum transmit power of the BS, and the SIC decoding condition. To solve this non-convex problem, we propose a two-loop iterative optimization algorithm that combines the hippopotamus optimization (HO) method with the alternating optimization (AO) method to obtain a suboptimal solution efficiently. 
Specifically, in the inner loop, the complex-valued precoding matrix and the binary decoding indicator matrix are optimized alternatively by the successive convex approximation (SCA) technique with customized greedy search to maximize the minimum achievable rate for the given positions of the MAs. In the outer loop, each user&#39;s antenna position is updated using the HO algorithm, following a novel nature-inspired intelligent optimization framework. Simulation results show that the proposed algorithms can effectively avoid local optimum for highly coupled variables and significantly improve the rate performance of the NOMA system compared to the conventional FPA system as well as other benchmark schemes. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.12531v1-abstract-full').style.display = 'none'; document.getElementById('2412.12531v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.12201">arXiv:2412.12201</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.12201">pdf</a>, <a href="https://arxiv.org/format/2412.12201">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Embracing Large Language Models in Traffic Flow Forecasting </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+Y">Yusheng Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Luo%2C+X">Xiao Luo</a>, <a href="/search/cs?searchtype=author&amp;query=Wen%2C+H">Haomin Wen</a>, <a href="/search/cs?searchtype=author&amp;query=Xiao%2C+Z">Zhiping Xiao</a>, <a href="/search/cs?searchtype=author&amp;query=Ju%2C+W">Wei Ju</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+M">Ming Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.12201v1-abstract-short" style="display: inline;"> Traffic flow forecasting aims to predict future traffic flows based on the historical traffic conditions and the road network. It is an important problem in intelligent transportation systems, with a plethora of methods having been proposed. Existing efforts mainly focus on capturing and utilizing spatio-temporal dependencies to predict future traffic flows. Though promising, they fall short in adapting&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.12201v1-abstract-full').style.display = 'inline'; document.getElementById('2412.12201v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.12201v1-abstract-full" style="display: none;"> Traffic flow forecasting aims to predict future traffic flows based on the historical traffic conditions and the road network. 
arXiv:2412.12201 (https://arxiv.org/abs/2412.12201) [pdf, other]
Subjects: cs.LG (Machine Learning); cs.AI (Artificial Intelligence)
Title: Embracing Large Language Models in Traffic Flow Forecasting
Authors: Yusheng Zhao, Xiao Luo, Haomin Wen, Zhiping Xiao, Wei Ju, Ming Zhang
Abstract: Traffic flow forecasting aims to predict future traffic flows based on the historical traffic conditions and the road network. It is an important problem in intelligent transportation systems, with a plethora of methods having been proposed. Existing efforts mainly focus on capturing and utilizing spatio-temporal dependencies to predict future traffic flows. Though promising, they fall short in adapting to test-time environmental changes in traffic conditions. To tackle this challenge, we propose to introduce large language models (LLMs) to help traffic flow forecasting and design a novel method named Large Language Model Enhanced Traffic Flow Predictor (LEAF). LEAF adopts two branches, capturing different spatio-temporal relations using graph and hypergraph structures respectively. The two branches are first pre-trained individually, and during test time, they yield different predictions. Based on these predictions, a large language model is used to select the most likely result. Then, a ranking loss is applied as the learning objective to enhance the prediction ability of the two branches. Extensive experiments on several datasets demonstrate the effectiveness of the proposed LEAF.
Submitted 14 December, 2024; originally announced December 2024.
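The abstract mentions a ranking loss over the two branch predictions; the sketch below shows one plausible form of such an objective (a hinge-style margin ranking loss in PyTorch). The variable names, the margin value, and the encoding of the LLM's choice as "selected branch vs. other branch" are illustrative assumptions, not LEAF's actual design.

import torch
import torch.nn.functional as F

def branch_ranking_loss(err_selected, err_other, margin=0.1):
    # Encourage the branch picked by the LLM to have lower forecast error
    # than the other branch by at least `margin` (hinge / margin ranking loss).
    return F.relu(err_selected - err_other + margin).mean()

# Toy usage: per-sample absolute forecast errors of the two branches.
err_graph = torch.tensor([0.20, 0.35, 0.10])   # branch the LLM selected (assumed)
err_hyper = torch.tensor([0.30, 0.30, 0.25])   # the other branch
loss = branch_ranking_loss(err_graph, err_hyper)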
id="2412.12154v1-abstract-short" style="display: inline;"> Outlier detection (OD), also known as anomaly detection, is a critical machine learning (ML) task with applications in fraud detection, network intrusion detection, clickstream analysis, recommendation systems, and social network moderation. Among open-source libraries for outlier detection, the Python Outlier Detection (PyOD) library is the most widely adopted, with over 8,500 GitHub stars, 25 mi&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.12154v1-abstract-full').style.display = 'inline'; document.getElementById('2412.12154v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.12154v1-abstract-full" style="display: none;"> Outlier detection (OD), also known as anomaly detection, is a critical machine learning (ML) task with applications in fraud detection, network intrusion detection, clickstream analysis, recommendation systems, and social network moderation. Among open-source libraries for outlier detection, the Python Outlier Detection (PyOD) library is the most widely adopted, with over 8,500 GitHub stars, 25 million downloads, and diverse industry usage. However, PyOD currently faces three limitations: (1) insufficient coverage of modern deep learning algorithms, (2) fragmented implementations across PyTorch and TensorFlow, and (3) no automated model selection, making it hard for non-experts. To address these issues, we present PyOD Version 2 (PyOD 2), which integrates 12 state-of-the-art deep learning models into a unified PyTorch framework and introduces a large language model (LLM)-based pipeline for automated OD model selection. These improvements simplify OD workflows, provide access to 45 algorithms, and deliver robust performance on various datasets. In this paper, we demonstrate how PyOD 2 streamlines the deployment and automation of OD models and sets a new standard in both research and industry. PyOD 2 is accessible at [https://github.com/yzhao062/pyod](https://github.com/yzhao062/pyod). This study aligns with the Web Mining and Content Analysis track, addressing topics such as the robustness of Web mining methods and the quality of algorithmically-generated Web data. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.12154v1-abstract-full').style.display = 'none'; document.getElementById('2412.12154v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. 
arXiv:2412.12087 (https://arxiv.org/abs/2412.12087) [pdf, other]
Subjects: cs.CV (Computer Vision and Pattern Recognition)
Title: Instruction-based Image Manipulation by Watching How Things Move
Authors: Mingdeng Cao, Xuaner Zhang, Yinqiang Zheng, Zhihao Xia
Abstract: This paper introduces a novel dataset construction pipeline that samples pairs of frames from videos and uses multimodal large language models (MLLMs) to generate editing instructions for training instruction-based image manipulation models. Video frames inherently preserve the identity of subjects and scenes, ensuring consistent content preservation during editing. Additionally, video data captures diverse, natural dynamics, such as non-rigid subject motion and complex camera movements, that are difficult to model otherwise, making it an ideal source for scalable dataset construction. Using this approach, we create a new dataset to train InstructMove, a model capable of instruction-based complex manipulations that are difficult to achieve with synthetically generated datasets. Our model demonstrates state-of-the-art performance in tasks such as adjusting subject poses, rearranging elements, and altering camera perspectives.
Submitted 16 December, 2024; originally announced December 2024.
Comments: Project page: https://ljzycmd.github.io/projects/InstructMove/
arXiv:2412.11142 (https://arxiv.org/abs/2412.11142) [pdf, other]
Subjects: cs.CL (Computation and Language); cs.AI (Artificial Intelligence)
Title: AD-LLM: Benchmarking Large Language Models for Anomaly Detection
Authors: Tiankai Yang, Yi Nian, Shawn Li, Ruiyao Xu, Yuangang Li, Jiaqi Li, Zhuo Xiao, Xiyang Hu, Ryan Rossi, Kaize Ding, Xia Hu, Yue Zhao
Abstract: Anomaly detection (AD) is an important machine learning task with many real-world uses, including fraud detection, medical diagnosis, and industrial monitoring. Within natural language processing (NLP), AD helps detect issues like spam, misinformation, and unusual user activity. Although large language models (LLMs) have had a strong impact on tasks such as text generation and summarization, their potential in AD has not been studied enough. This paper introduces AD-LLM, the first benchmark that evaluates how LLMs can help with NLP anomaly detection. We examine three key tasks: (i) zero-shot detection, using LLMs' pre-trained knowledge to perform AD without task-specific training; (ii) data augmentation, generating synthetic data and category descriptions to improve AD models; and (iii) model selection, using LLMs to suggest unsupervised AD models. Through experiments with different datasets, we find that LLMs can work well in zero-shot AD, that carefully designed augmentation methods are useful, and that explaining model selection for specific datasets remains challenging. Based on these results, we outline six future research directions on LLMs for AD.
Submitted 15 December, 2024; originally announced December 2024.
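A toy sketch of the zero-shot setting described in task (i): ask an LLM whether a text looks anomalous relative to a short description of the normal data. The prompt wording and the ask_llm callable are placeholders for illustration, not the benchmark's actual interface.

def zero_shot_anomaly_prompt(text, normal_description):
    # Placeholder prompt; AD-LLM's actual prompts and scoring may differ.
    return (
        f"Normal samples in this dataset are: {normal_description}\n"
        f"Sample: {text}\n"
        "Answer with 'anomaly' or 'normal'."
    )

def is_anomalous(text, normal_description, ask_llm):
    # `ask_llm` is any callable that sends a prompt to an LLM and returns its reply text.
    reply = ask_llm(zero_shot_anomaly_prompt(text, normal_description))
    return "anomaly" in reply.lower()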
arXiv:2412.09627 (https://arxiv.org/abs/2412.09627) [pdf, other]
Subjects: cs.CV (Computer Vision and Pattern Recognition); cs.AI (Artificial Intelligence); cs.LG (Machine Learning)
Title: Doe-1: Closed-Loop Autonomous Driving with Large World Model
Authors: Wenzhao Zheng, Zetian Xia, Yuanhui Huang, Sicheng Zuo, Jie Zhou, Jiwen Lu
Abstract: End-to-end autonomous driving has received increasing attention due to its potential to learn from large amounts of data. However, most existing methods are still open-loop and suffer from weak scalability, lack of high-order interactions, and inefficient decision-making. In this paper, we explore a closed-loop framework for autonomous driving and propose a large Driving wOrld modEl (Doe-1) for unified perception, prediction, and planning. We formulate autonomous driving as a next-token generation problem and use multi-modal tokens to accomplish different tasks. Specifically, we use free-form texts (i.e., scene descriptions) for perception and generate future predictions directly in the RGB space with image tokens. For planning, we employ a position-aware tokenizer to effectively encode actions into discrete tokens. We train a multi-modal transformer to autoregressively generate perception, prediction, and planning tokens in an end-to-end and unified manner. Experiments on the widely used nuScenes dataset demonstrate the effectiveness of Doe-1 in various tasks including visual question-answering, action-conditioned video generation, and motion planning. Code: https://github.com/wzzheng/Doe
Submitted 12 December, 2024; originally announced December 2024.
Comments: Code is available at: https://github.com/wzzheng/Doe
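A generic next-token factorization of the kind this abstract sketches, with notation assumed here rather than taken from the paper: an interleaved sequence of perception (text), prediction (image), and planning (action) tokens x_1, ..., x_T is modeled autoregressively as

\[
p_\theta(x_1,\dots,x_T) \;=\; \prod_{t=1}^{T} p_\theta\bigl(x_t \mid x_{<t}\bigr),
\qquad
x_t \in \mathcal{V}_{\text{text}} \cup \mathcal{V}_{\text{image}} \cup \mathcal{V}_{\text{action}},
\]

so all three task outputs are produced by the same transformer, one token at a time.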
arXiv:2412.07775 (https://arxiv.org/abs/2412.07775) [pdf, other]
Subjects: cs.LG (Machine Learning); cs.CV (Computer Vision and Pattern Recognition)
Title: Efficient Diversity-Preserving Diffusion Alignment via Gradient-Informed GFlowNets
Authors: Zhen Liu, Tim Z. Xiao, Weiyang Liu, Yoshua Bengio, Dinghuai Zhang
Abstract: While one commonly trains large diffusion models by collecting datasets on target downstream tasks, it is often desired to align and finetune pretrained diffusion models on some reward functions that are either designed by experts or learned from small-scale datasets. Existing methods for finetuning diffusion models typically suffer from lack of diversity in generated samples, lack of prior preservation, and/or slow convergence in finetuning. Inspired by recent successes in generative flow networks (GFlowNets), a class of probabilistic models that sample with the unnormalized density of a reward function, we propose a novel GFlowNet method dubbed Nabla-GFlowNet (abbreviated as $\nabla$-GFlowNet), the first GFlowNet method that leverages the rich signal in reward gradients, together with an objective called $\nabla$-DB plus its variant residual $\nabla$-DB designed for prior-preserving diffusion alignment. We show that our proposed method achieves fast yet diversity- and prior-preserving alignment of Stable Diffusion, a large-scale text-conditioned image diffusion model, on different realistic reward functions.
Submitted 10 December, 2024; originally announced December 2024.
Comments: Technical Report (35 pages, 31 figures)
arXiv:2412.07260 (https://arxiv.org/abs/2412.07260) [pdf, other]
Subjects: cs.CV (Computer Vision and Pattern Recognition)
Title: DFREC: DeepFake Identity Recovery Based on Identity-aware Masked Autoencoder
Authors: Peipeng Yu, Hui Gao, Zhitao Huang, Zhihua Xia, Chip-Hong Chang
Abstract: Recent advances in deepfake forensics have primarily focused on improving classification accuracy and generalization performance. Despite enormous progress in detection accuracy across a wide variety of forgery algorithms, existing algorithms lack intuitive interpretability and identity traceability to help with forensic investigation. In this paper, we introduce a novel DeepFake Identity Recovery scheme (DFREC) to fill this gap. DFREC aims to recover the pair of source and target faces from a deepfake image to facilitate deepfake identity tracing and reduce the risk of deepfake attacks. It comprises three key components: an Identity Segmentation Module (ISM), a Source Identity Reconstruction Module (SIRM), and a Target Identity Reconstruction Module (TIRM). The ISM segments the input face into distinct source and target face information, and the SIRM reconstructs the source face and extracts latent target identity features from the segmented source information. The background context and latent target identity features are synergistically fused by a Masked Autoencoder in the TIRM to reconstruct the target face. We evaluate DFREC on six different high-fidelity face-swapping attacks on the FaceForensics++, CelebaMegaFS and FFHQ-E4S datasets, which demonstrate its superior recovery performance over state-of-the-art deepfake recovery algorithms. In addition, DFREC is the only scheme that can recover both pristine source and target faces directly from the forgery image with high fidelity.
Submitted 10 December, 2024; originally announced December 2024.
arXiv:2412.06590 (https://arxiv.org/abs/2412.06590) [pdf, other]
Subjects: cs.CV (Computer Vision and Pattern Recognition)
Title: Bridging the Divide: Reconsidering Softmax and Linear Attention
Authors: Dongchen Han, Yifan Pu, Zhuofan Xia, Yizeng Han, Xuran Pan, Xiu Li, Jiwen Lu, Shiji Song, Gao Huang
Abstract: Widely adopted in modern Vision Transformer designs, Softmax attention can effectively capture long-range visual information; however, it incurs excessive computational cost when dealing with high-resolution inputs. In contrast, linear attention naturally enjoys linear complexity and has great potential to scale up to higher-resolution images. Nonetheless, the unsatisfactory performance of linear attention greatly limits its practical application in various scenarios. In this paper, we take a step forward to close the gap between linear and Softmax attention with novel theoretical analyses, which demystify the core factors behind the performance deviations. Specifically, we present two key perspectives to understand and alleviate the limitations of linear attention: the injective property and the local modeling ability. Firstly, we prove that linear attention is not injective, which makes it prone to assigning identical attention weights to different query vectors, leading to severe semantic confusion since different queries then correspond to the same outputs. Secondly, we confirm that effective local modeling is essential for the success of Softmax attention, in which linear attention falls short. These two fundamental differences largely account for the disparities between the two attention paradigms, as demonstrated by our substantial empirical validation. In addition, further experimental results indicate that linear attention, as long as it is endowed with these two properties, can outperform Softmax attention across various tasks while maintaining lower computational complexity. Code is available at https://github.com/LeapLabTHU/InLine
Submitted 9 December, 2024; originally announced December 2024.
Comments: NeurIPS 2024
arXiv:2412.05569 (https://arxiv.org/abs/2412.05569) [pdf, other]
Subjects: cs.LG (Machine Learning); q-bio.BM (Biomolecules)
Title: SMI-Editor: Edit-based SMILES Language Model with Fragment-level Supervision
Authors: Kangjie Zheng, Siyue Liang, Junwei Yang, Bin Feng, Zequn Liu, Wei Ju, Zhiping Xiao, Ming Zhang
Abstract: SMILES, a crucial textual representation of molecular structures, has garnered significant attention as a foundation for pre-trained language models (LMs). However, most existing pre-trained SMILES LMs focus solely on single-token-level supervision during pre-training, failing to fully leverage the substructural information of molecules. This limitation makes the pre-training task overly simplistic, preventing the models from capturing richer molecular semantic information. Moreover, during pre-training, these SMILES LMs only process corrupted SMILES inputs, never encountering any valid SMILES, which leads to a train-inference mismatch. To address these challenges, we propose SMI-Editor, a novel edit-based pre-trained SMILES LM. SMI-Editor disrupts substructures within a molecule at random and feeds the resulting SMILES back into the model, which then attempts to restore the original SMILES through an editing process. This approach not only introduces fragment-level training signals but also enables the use of valid SMILES as inputs, allowing the model to learn how to reconstruct complete molecules from these incomplete structures. As a result, the model demonstrates improved scalability and an enhanced ability to capture fragment-level molecular information. Experimental results show that SMI-Editor achieves state-of-the-art performance across multiple downstream molecular tasks, even outperforming several 3D molecular representation models.
Submitted 7 December, 2024; originally announced December 2024.
arXiv:2412.04784 (https://arxiv.org/abs/2412.04784) [pdf, other]
Subjects: cs.CL (Computation and Language); cs.LG (Machine Learning)
Title: NLP-ADBench: NLP Anomaly Detection Benchmark
Authors: Yuangang Li, Jiaqi Li, Zhuo Xiao, Tiankai Yang, Yi Nian, Xiyang Hu, Yue Zhao
Abstract: Anomaly detection (AD) is a critical machine learning task with diverse applications in web systems, including fraud detection, content moderation, and user behavior analysis. Despite its significance, AD in natural language processing (NLP) remains underexplored, limiting advancements in detecting anomalies in text data such as harmful content, phishing attempts, or spam reviews. In this paper, we introduce NLP-ADBench, the most comprehensive benchmark for NLP anomaly detection (NLP-AD), comprising eight curated datasets and evaluations of nineteen state-of-the-art algorithms. These include three end-to-end methods and sixteen two-step algorithms that apply traditional anomaly detection techniques to language embeddings generated by bert-base-uncased and OpenAI's text-embedding-3-large models. Our results reveal critical insights and future directions for NLP-AD. Notably, no single model excels across all datasets, highlighting the need for automated model selection. Moreover, two-step methods leveraging transformer-based embeddings consistently outperform specialized end-to-end approaches, with OpenAI embeddings demonstrating superior performance over BERT embeddings. By releasing NLP-ADBench at https://github.com/USC-FORTIS/NLP-ADBench, we provide a standardized framework for evaluating NLP-AD methods, fostering the development of innovative approaches. This work fills a crucial gap in the field and establishes a foundation for advancing NLP anomaly detection, particularly in the context of improving the safety and reliability of web-based systems.
Submitted 6 December, 2024; originally announced December 2024.
Comments: The project is available at https://github.com/USC-FORTIS/NLP-ADBench
arXiv:2412.02689 (https://arxiv.org/abs/2412.02689) [pdf, other]
Subjects: cs.RO (Robotics)
Title: Preliminary Investigation into Data Scaling Laws for Imitation Learning-Based End-to-End Autonomous Driving
Authors: Yupeng Zheng, Zhongpu Xia, Qichao Zhang, Teng Zhang, Ben Lu, Xiaochuang Huo, Chao Han, Yixian Li, Mengjie Yu, Bu Jin, Pengxuan Yang, Yuhang Zheng, Haifeng Yuan, Ke Jiang, Peng Jia, Xianpeng Lang, Dongbin Zhao
Abstract: The end-to-end autonomous driving paradigm has recently attracted lots of attention due to its scalability. However, existing methods are constrained by the limited scale of real-world data, which hinders a comprehensive exploration of the scaling laws associated with end-to-end autonomous driving. To address this issue, we collected substantial data from various driving scenarios and behaviors and conducted an extensive study on the scaling laws of existing imitation learning-based end-to-end autonomous driving paradigms. Specifically, approximately 4 million demonstrations from 23 different scenario types were gathered, amounting to over 30,000 hours of driving demonstrations. We performed open-loop evaluations and closed-loop simulation evaluations on 1,400 diverse driving demonstrations (1,300 for open-loop and 100 for closed-loop) under stringent assessment conditions. Through experimental analysis, we discovered that (1) the performance of the driving model exhibits a power-law relationship with the amount of training data; (2) a small increase in the quantity of long-tailed data can significantly improve performance on the corresponding scenarios; (3) appropriate scaling of data enables the model to achieve combinatorial generalization in novel scenes and actions. Our results highlight the critical role of data scaling in improving the generalizability of models across diverse autonomous driving scenarios, assuring safe deployment in the real world. Project repository: https://github.com/ucaszyp/Driving-Scaling-Law
Submitted 3 December, 2024; originally announced December 2024.
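A small illustration of fitting a power-law data-scaling curve of the kind reported in finding (1): assume error ≈ a · N^(-b) and estimate the exponent by linear regression in log-log space. The numbers below are made up for illustration, not the paper's results.

import numpy as np

# Hypothetical (number of demonstrations, validation error) pairs.
n_demos = np.array([1e4, 3e4, 1e5, 3e5, 1e6, 4e6])
val_err = np.array([0.92, 0.71, 0.52, 0.40, 0.30, 0.21])

# Fit error ~ a * N^(-b) via least squares on log(error) = log(a) - b * log(N).
slope, intercept = np.polyfit(np.log(n_demos), np.log(val_err), 1)
a, b = np.exp(intercept), -slope
print(f"error ~= {a:.2f} * N^(-{b:.2f})")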
arXiv:2412.01383 (https://arxiv.org/abs/2412.01383) [pdf, other]
Subjects: cs.CV (Computer Vision and Pattern Recognition); cs.AI (Artificial Intelligence); cs.CY (Computers and Society); cs.LG (Machine Learning)
Title: Second FRCSyn-onGoing: Winning Solutions and Post-Challenge Analysis to Improve Face Recognition with Synthetic Data
Authors: Ivan DeAndres-Tame, Ruben Tolosana, Pietro Melzi, Ruben Vera-Rodriguez, Minchul Kim, Christian Rathgeb, Xiaoming Liu, Luis F. Gomez, Aythami Morales, Julian Fierrez, Javier Ortega-Garcia, Zhizhou Zhong, Yuge Huang, Yuxi Mi, Shouhong Ding, Shuigeng Zhou, Shuai He, Lingzhi Fu, Heng Cong, Rongyu Zhang, Zhihong Xiao, Evgeny Smirnov, Anton Pimenov, Aleksei Grigorev, Denis Timoshenko, et al. (34 additional authors not shown)
Abstract: Synthetic data is gaining increasing popularity for face recognition technologies, mainly due to the privacy concerns and challenges associated with obtaining real data, including diverse scenarios, quality, and demographic groups, among others. It also offers some advantages over real data, such as the large amount of data that can be generated or the ability to customize it to adapt to specific problem-solving needs. To effectively use such data, face recognition models should also be specifically designed to exploit synthetic data to its fullest potential. In order to promote the proposal of novel Generative AI methods and synthetic data, and to investigate the application of synthetic data to better train face recognition systems, we introduce the 2nd FRCSyn-onGoing challenge, based on the 2nd Face Recognition Challenge in the Era of Synthetic Data (FRCSyn), originally launched at CVPR 2024. This is an ongoing challenge that provides researchers with an accessible platform to benchmark i) the proposal of novel Generative AI methods and synthetic data, and ii) novel face recognition systems that are specifically proposed to take advantage of synthetic data. We focus on exploring the use of synthetic data both individually and in combination with real data to solve current challenges in face recognition such as demographic bias, domain adaptation, and performance constraints in demanding situations, such as age disparities between training and testing, changes in pose, or occlusions. Very interesting findings are obtained in this second edition, including a direct comparison with the first one, in which synthetic databases were restricted to DCFace and GANDiffFace.
Submitted 2 December, 2024; originally announced December 2024.
