Search | arXiv e-print repository

<!DOCTYPE html> <html lang="en"> <head> <meta charset="utf-8"/> <meta name="viewport" content="width=device-width, initial-scale=1"/>  <link rel="apple-touch-icon" sizes="180x180" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/apple-touch-icon.png"> <link rel="icon" type="image/png" sizes="32x32" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-32x32.png"> <link rel="icon" type="image/png" sizes="16x16" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-16x16.png"> <link rel="manifest" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/site.webmanifest"> <link rel="mask-icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/safari-pinned-tab.svg" color="#b31b1b"> <link rel="shortcut icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon.ico"> <meta name="msapplication-TileColor" content="#b31b1b"> <meta name="msapplication-config" content="images/icons/browserconfig.xml"> <meta name="theme-color" content="#b31b1b">  <title>Search | arXiv e-print repository</title> <script defer src="https://static.arxiv.org/static/base/1.0.0a5/fontawesome-free-5.11.2-web/js/all.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/base/1.0.0a5/css/arxivstyle.css" /> <script type="text/x-mathjax-config"> MathJax.Hub.Config({ messageStyle: "none", extensions: ["tex2jax.js"], jax: ["input/TeX", "output/HTML-CSS"], tex2jax: { inlineMath: [ ['$','$'], ["\$","\$"] ], displayMath: [ ['$$','$$'], ["\\[","\\]"] ], processEscapes: true, ignoreClass: '.*', processClass: 'mathjax.*' }, TeX: { extensions: ["AMSmath.js", "AMSsymbols.js", "noErrors.js"], noErrors: { inlineDelimiters: ["$","$"], multiLine: false, style: { "font-size": "normal", "border": "" } } }, "HTML-CSS": { availableFonts: ["TeX"] } }); </script> <script src='//static.arxiv.org/MathJax-2.7.3/MathJax.js'></script> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/notification.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/bulma-tooltip.min.css" /> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/search.css" /> <script src="https://code.jquery.com/jquery-3.2.1.slim.min.js" integrity="sha256-k2WSCIexGzOj3Euiig+TlR8gA0EmPjuc79OEeY5L45g=" crossorigin="anonymous"></script> <script src="https://static.arxiv.org/static/search/0.5.6/js/fieldset.js"></script> <style> radio#cf-customfield_11400 { display: none; } </style> </head> <body> <header><a href="#main-container" class="is-sr-only">Skip to main content</a>  <div class="attribution level is-marginless" role="banner"> <div class="level-left"> <a class="level-item" href="https://cornell.edu/"><img src="https://static.arxiv.org/static/base/1.0.0a5/images/cornell-reduced-white-SMALL.svg" alt="Cornell University" width="200" aria-label="logo" /></a> </div> <div class="level-right is-marginless"><p class="sponsors level-item is-marginless"><span id="support-ack-url">We gratefully acknowledge support from<br /> the Simons Foundation, <a href="https://info.arxiv.org/about/ourmembers.html">member institutions</a>, and all contributors. <a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div>  <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div>  <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1–50 of 1,541 results for author: <span class="mathjax">Song, Y</span> </h1> </div> <div class="level-right is-hidden-mobile">  <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>  </span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&query=Song%2C+Y">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." type="text" value="Song, Y"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Song%2C+Y&terms-0-field=author&size=50&order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Song, Y"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. </div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Song%2C+Y&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Song%2C+Y&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Song%2C+Y&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Song%2C+Y&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Song%2C+Y&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Song%2C+Y&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">…</span></li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.17129">arXiv:2502.17129</a> <span> [<a href="https://arxiv.org/pdf/2502.17129">pdf</a>, <a href="https://arxiv.org/format/2502.17129">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Thus Spake Long-Context Large Language Model </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+X">Xiaoran Liu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+R">Ruixiao Li</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+M">Mianqiu Huang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Z">Zhigeng Liu</a>, <a href="/search/cs?searchtype=author&query=Song%2C+Y">Yuerong Song</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+Q">Qipeng Guo</a>, <a href="/search/cs?searchtype=author&query=He%2C+S">Siyang He</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Q">Qiqi Wang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+L">Linlin Li</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Q">Qun Liu</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+Y">Yaqian Zhou</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+X">Xuanjing Huang</a>, <a href="/search/cs?searchtype=author&query=Qiu%2C+X">Xipeng Qiu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.17129v1-abstract-short" style="display: inline;"> Long context is an important topic in Natural Language Processing (NLP), running through the development of NLP architectures, and offers immense opportunities for Large Language Models (LLMs) giving LLMs the lifelong learning potential akin to humans. Unfortunately, the pursuit of a long context is accompanied by numerous obstacles. Nevertheless, long context remains a core competitive advantage… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.17129v1-abstract-full').style.display = 'inline'; document.getElementById('2502.17129v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.17129v1-abstract-full" style="display: none;"> Long context is an important topic in Natural Language Processing (NLP), running through the development of NLP architectures, and offers immense opportunities for Large Language Models (LLMs) giving LLMs the lifelong learning potential akin to humans. Unfortunately, the pursuit of a long context is accompanied by numerous obstacles. Nevertheless, long context remains a core competitive advantage for LLMs. In the past two years, the context length of LLMs has achieved a breakthrough extension to millions of tokens. Moreover, the research on long-context LLMs has expanded from length extrapolation to a comprehensive focus on architecture, infrastructure, training, and evaluation technologies. Inspired by the symphonic poem, Thus Spake Zarathustra, we draw an analogy between the journey of extending the context of LLM and the attempts of humans to transcend its mortality. In this survey, We will illustrate how LLM struggles between the tremendous need for a longer context and its equal need to accept the fact that it is ultimately finite. To achieve this, we give a global picture of the lifecycle of long-context LLMs from four perspectives: architecture, infrastructure, training, and evaluation, showcasing the full spectrum of long-context technologies. At the end of this survey, we will present 10 unanswered questions currently faced by long-context LLMs. We hope this survey can serve as a systematic introduction to the research on long-context LLMs. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.17129v1-abstract-full').style.display = 'none'; document.getElementById('2502.17129v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">a global picture of the lifecycle of long-context LLMs from four perspectives: architecture, infrastructure, training, and evaluation</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.17041">arXiv:2502.17041</a> <span> [<a href="https://arxiv.org/pdf/2502.17041">pdf</a>, <a href="https://arxiv.org/format/2502.17041">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> PrivaCI-Bench: Evaluating Privacy with Contextual Integrity and Legal Compliance </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+H">Haoran Li</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+W">Wenbin Hu</a>, <a href="/search/cs?searchtype=author&query=Jing%2C+H">Huihao Jing</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yulin Chen</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+Q">Qi Hu</a>, <a href="/search/cs?searchtype=author&query=Han%2C+S">Sirui Han</a>, <a href="/search/cs?searchtype=author&query=Chu%2C+T">Tianshu Chu</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+P">Peizhao Hu</a>, <a href="/search/cs?searchtype=author&query=Song%2C+Y">Yangqiu Song</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.17041v1-abstract-short" style="display: inline;"> Recent advancements in generative large language models (LLMs) have enabled wider applicability, accessibility, and flexibility. However, their reliability and trustworthiness are still in doubt, especially for concerns regarding individuals' data privacy. Great efforts have been made on privacy by building various evaluation benchmarks to study LLMs' privacy awareness and robustness from their ge… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.17041v1-abstract-full').style.display = 'inline'; document.getElementById('2502.17041v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.17041v1-abstract-full" style="display: none;"> Recent advancements in generative large language models (LLMs) have enabled wider applicability, accessibility, and flexibility. However, their reliability and trustworthiness are still in doubt, especially for concerns regarding individuals' data privacy. Great efforts have been made on privacy by building various evaluation benchmarks to study LLMs' privacy awareness and robustness from their generated outputs to their hidden representations. Unfortunately, most of these works adopt a narrow formulation of privacy and only investigate personally identifiable information (PII). In this paper, we follow the merit of the Contextual Integrity (CI) theory, which posits that privacy evaluation should not only cover the transmitted attributes but also encompass the whole relevant social context through private information flows. We present PrivaCI-Bench, a comprehensive contextual privacy evaluation benchmark targeted at legal compliance to cover well-annotated privacy and safety regulations, real court cases, privacy policies, and synthetic data built from the official toolkit to study LLMs' privacy and safety compliance. We evaluate the latest LLMs, including the recent reasoner models QwQ-32B and Deepseek R1. Our experimental results suggest that though LLMs can effectively capture key CI parameters inside a given context, they still require further advancements for privacy compliance. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.17041v1-abstract-full').style.display = 'none'; document.getElementById('2502.17041v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project Webpage: https://hkust-knowcomp.github.io/privacy/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.16580">arXiv:2502.16580</a> <span> [<a href="https://arxiv.org/pdf/2502.16580">pdf</a>, <a href="https://arxiv.org/format/2502.16580">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> </div> </div> <p class="title is-5 mathjax"> Can Indirect Prompt Injection Attacks Be Detected and Removed? </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yulin Chen</a>, <a href="/search/cs?searchtype=author&query=Li%2C+H">Haoran Li</a>, <a href="/search/cs?searchtype=author&query=Sui%2C+Y">Yuan Sui</a>, <a href="/search/cs?searchtype=author&query=He%2C+Y">Yufei He</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yue Liu</a>, <a href="/search/cs?searchtype=author&query=Song%2C+Y">Yangqiu Song</a>, <a href="/search/cs?searchtype=author&query=Hooi%2C+B">Bryan Hooi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.16580v1-abstract-short" style="display: inline;"> Prompt injection attacks manipulate large language models (LLMs) by misleading them to deviate from the original input instructions and execute maliciously injected instructions, because of their instruction-following capabilities and inability to distinguish between the original input instructions and maliciously injected instructions. To defend against such attacks, recent studies have developed… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.16580v1-abstract-full').style.display = 'inline'; document.getElementById('2502.16580v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.16580v1-abstract-full" style="display: none;"> Prompt injection attacks manipulate large language models (LLMs) by misleading them to deviate from the original input instructions and execute maliciously injected instructions, because of their instruction-following capabilities and inability to distinguish between the original input instructions and maliciously injected instructions. To defend against such attacks, recent studies have developed various detection mechanisms. While significant efforts have focused on detecting direct prompt injection attacks, where injected instructions are directly from the attacker who is also the user, limited attention has been given to indirect prompt injection attacks, where injected instructions are indirectly from external tools, such as a search engine. Moreover, current works mainly investigate injection detection methods and pay less attention to the post-processing method that aims to mitigate the injection after detection. In this paper, we investigate the feasibility of detecting and removing indirect prompt injection attacks, and we construct a benchmark dataset for evaluation. For detection, we assess the performance of existing LLMs and open-source detection models, and we further train detection models using our crafted training datasets. For removal, we evaluate two intuitive methods: (1) the segmentation removal method, which segments the injected document and removes parts containing injected instructions, and (2) the extraction removal method, which trains an extraction model to identify and remove injected instructions. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.16580v1-abstract-full').style.display = 'none'; document.getElementById('2502.16580v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">17 pages, 6 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.16444">arXiv:2502.16444</a> <span> [<a href="https://arxiv.org/pdf/2502.16444">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Fluid Dynamics">physics.flu-dyn</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Atmospheric and Oceanic Physics">physics.ao-ph</span> </div> </div> <p class="title is-5 mathjax"> Update hydrological states or meteorological forcings? Comparing data assimilation methods for differentiable hydrologic models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Jamaat%2C+A">Amirmoez Jamaat</a>, <a href="/search/cs?searchtype=author&query=Song%2C+Y">Yalan Song</a>, <a href="/search/cs?searchtype=author&query=Rahmani%2C+F">Farshid Rahmani</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+J">Jiangtao Liu</a>, <a href="/search/cs?searchtype=author&query=Lawson%2C+K">Kathryn Lawson</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+C">Chaopeng Shen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.16444v1-abstract-short" style="display: inline;"> Data assimilation (DA) enables hydrologic models to update their internal states using near-real-time observations for more accurate forecasts. With deep neural networks like long short-term memory (LSTM), using either lagged observations as inputs (called "data integration") or variational DA has shown success in improving forecasts. However, it is unclear which methods are performant or optimal… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.16444v1-abstract-full').style.display = 'inline'; document.getElementById('2502.16444v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.16444v1-abstract-full" style="display: none;"> Data assimilation (DA) enables hydrologic models to update their internal states using near-real-time observations for more accurate forecasts. With deep neural networks like long short-term memory (LSTM), using either lagged observations as inputs (called "data integration") or variational DA has shown success in improving forecasts. However, it is unclear which methods are performant or optimal for physics-informed machine learning ("differentiable") models, which represent only a small amount of physically-meaningful states while using deep networks to supply parameters or missing processes. Here we developed variational DA methods for differentiable models, including optimizing adjusters for just precipitation data, just model internal hydrological states, or both. Our results demonstrated that differentiable streamflow models using the CAMELS dataset can benefit strongly and equivalently from variational DA as LSTM, with one-day lead time median Nash-Sutcliffe efficiency (NSE) elevated from 0.75 to 0.82. The resulting forecast matched or outperformed LSTM with DA in the eastern, northwestern, and central Great Plains regions of the conterminous United States. Both precipitation and state adjusters were needed to achieve these results, with the latter being substantially more effective on its own, and the former adding moderate benefits for high flows. Our DA framework does not need systematic training data and could serve as a practical DA scheme for whole river networks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.16444v1-abstract-full').style.display = 'none'; document.getElementById('2502.16444v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.16268">arXiv:2502.16268</a> <span> [<a href="https://arxiv.org/pdf/2502.16268">pdf</a>, <a href="https://arxiv.org/format/2502.16268">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> ThinkBench: Dynamic Out-of-Distribution Evaluation for Robust LLM Reasoning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Huang%2C+S">Shulin Huang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+L">Linyi Yang</a>, <a href="/search/cs?searchtype=author&query=Song%2C+Y">Yan Song</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+S">Shuang Chen</a>, <a href="/search/cs?searchtype=author&query=Cui%2C+L">Leyang Cui</a>, <a href="/search/cs?searchtype=author&query=Wan%2C+Z">Ziyu Wan</a>, <a href="/search/cs?searchtype=author&query=Zeng%2C+Q">Qingcheng Zeng</a>, <a href="/search/cs?searchtype=author&query=Wen%2C+Y">Ying Wen</a>, <a href="/search/cs?searchtype=author&query=Shao%2C+K">Kun Shao</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+W">Weinan Zhang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+J">Jun Wang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yue Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.16268v1-abstract-short" style="display: inline;"> Evaluating large language models (LLMs) poses significant challenges, particularly due to issues of data contamination and the leakage of correct answers. To address these challenges, we introduce ThinkBench, a novel evaluation framework designed to evaluate LLMs' reasoning capability robustly. ThinkBench proposes a dynamic data generation method for constructing out-of-distribution (OOD) datasets… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.16268v1-abstract-full').style.display = 'inline'; document.getElementById('2502.16268v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.16268v1-abstract-full" style="display: none;"> Evaluating large language models (LLMs) poses significant challenges, particularly due to issues of data contamination and the leakage of correct answers. To address these challenges, we introduce ThinkBench, a novel evaluation framework designed to evaluate LLMs' reasoning capability robustly. ThinkBench proposes a dynamic data generation method for constructing out-of-distribution (OOD) datasets and offers an OOD dataset that contains 2,912 samples drawn from reasoning tasks. ThinkBench unifies the evaluation of reasoning models and non-reasoning models. We evaluate 16 LLMs and 4 PRMs under identical experimental conditions and show that most of the LLMs' performance are far from robust and they face a certain level of data leakage. By dynamically generating OOD datasets, ThinkBench effectively provides a reliable evaluation of LLMs and reduces the impact of data contamination. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.16268v1-abstract-full').style.display = 'none'; document.getElementById('2502.16268v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.16169">arXiv:2502.16169</a> <span> [<a href="https://arxiv.org/pdf/2502.16169">pdf</a>, <a href="https://arxiv.org/format/2502.16169">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Patterns Over Principles: The Fragility of Inductive Reasoning in LLMs under Noisy Observations </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+C">Chunyang Li</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+W">Weiqi Wang</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+T">Tianshi Zheng</a>, <a href="/search/cs?searchtype=author&query=Song%2C+Y">Yangqiu Song</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.16169v1-abstract-short" style="display: inline;"> Inductive reasoning, a cornerstone of human cognition, enables generalization from limited data but hasn't yet been fully achieved by large language models (LLMs). While modern LLMs excel at reasoning tasks, their ability to maintain stable and consistent rule abstraction under imperfect observations remains underexplored. To fill this gap, in this work, we introduce Robust Rule Induction, a task… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.16169v1-abstract-full').style.display = 'inline'; document.getElementById('2502.16169v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.16169v1-abstract-full" style="display: none;"> Inductive reasoning, a cornerstone of human cognition, enables generalization from limited data but hasn't yet been fully achieved by large language models (LLMs). While modern LLMs excel at reasoning tasks, their ability to maintain stable and consistent rule abstraction under imperfect observations remains underexplored. To fill this gap, in this work, we introduce Robust Rule Induction, a task that evaluates LLMs' capability in inferring rules from data that are fused with noisy examples. To address this task, we further propose Sample-steered Rule Refinement (SRR), a method enhancing reasoning stability via observation diversification and execution-guided feedback. Experiments across arithmetic, cryptography, and list functions reveal: (1) SRR outperforms other methods with minimal performance degradation under noise; (2) Despite slight accuracy variation, LLMs exhibit instability under noise (e.g., 0% accuracy change with only 70% consistent score); (3) Counterfactual task gaps highlight LLMs' reliance on memorized patterns over genuine abstraction. Our findings challenge LLMs' reasoning robustness, revealing susceptibility to hypothesis drift and pattern overfitting, while providing empirical evidence critical for developing human-like inductive systems. Code and data are available at \href{https://github.com/lcy2723/Robust-Rule-Induction}{https://github.com/lcy2723/Robust-Rule-Induction}. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.16169v1-abstract-full').style.display = 'none'; document.getElementById('2502.16169v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.16121">arXiv:2502.16121</a> <span> [<a href="https://arxiv.org/pdf/2502.16121">pdf</a>, <a href="https://arxiv.org/format/2502.16121">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> From Target Tracking to Targeting Track -- Part II: Regularized Polynomial Trajectory Optimization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+T">Tiancheng Li</a>, <a href="/search/cs?searchtype=author&query=Song%2C+Y">Yan Song</a>, <a href="/search/cs?searchtype=author&query=Li%2C+G">Guchong Li</a>, <a href="/search/cs?searchtype=author&query=Li%2C+H">Hao Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.16121v1-abstract-short" style="display: inline;"> Target tracking entails the estimation of the evolution of the target state over time, namely the target trajectory. Different from the classical state space model, our series of studies, including this paper, model the collection of the target state as a stochastic process (SP) that is further decomposed into a deterministic part which represents the trend of the trajectory and a residual SP repr… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.16121v1-abstract-full').style.display = 'inline'; document.getElementById('2502.16121v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.16121v1-abstract-full" style="display: none;"> Target tracking entails the estimation of the evolution of the target state over time, namely the target trajectory. Different from the classical state space model, our series of studies, including this paper, model the collection of the target state as a stochastic process (SP) that is further decomposed into a deterministic part which represents the trend of the trajectory and a residual SP representing the residual fitting error. Subsequently, the tracking problem is formulated as a learning problem regarding the trajectory SP for which a key part is to estimate a trajectory FoT (T-FoT) best fitting the measurements in time series. For this purpose, we consider the polynomial T-FoT and address the regularized polynomial T-FoT optimization employing two distinct regularization strategies seeking trade-off between the accuracy and simplicity. One limits the order of the polynomial and then the best choice is determined by grid searching in a narrow, bounded range while the other adopts $\ell_0$ norm regularization for which the hybrid Newton solver is employed. Simulation results obtained in both single and multiple maneuvering target scenarios demonstrate the effectiveness of our approaches. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.16121v1-abstract-full').style.display = 'none'; document.getElementById('2502.16121v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Part II of a series of companion papers; 11 pages, 10 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.15857">arXiv:2502.15857</a> <span> [<a href="https://arxiv.org/pdf/2502.15857">pdf</a>, <a href="https://arxiv.org/format/2502.15857">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> PPC-GPT: Federated Task-Specific Compression of Large Language Models via Pruning and Chain-of-Thought Distillation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Fan%2C+T">Tao Fan</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+G">Guoqiang Ma</a>, <a href="/search/cs?searchtype=author&query=Song%2C+Y">Yuanfeng Song</a>, <a href="/search/cs?searchtype=author&query=Fan%2C+L">Lixin Fan</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+K">Kai Chen</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Q">Qiang Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.15857v1-abstract-short" style="display: inline;"> Compressing Large Language Models (LLMs) into task-specific Small Language Models (SLMs) encounters two significant challenges: safeguarding domain-specific knowledge privacy and managing limited resources. To tackle these challenges, we propose PPC-GPT, a innovative privacy-preserving federated framework specifically designed for compressing LLMs into task-specific SLMs via pruning and Chain-of-T… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.15857v1-abstract-full').style.display = 'inline'; document.getElementById('2502.15857v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.15857v1-abstract-full" style="display: none;"> Compressing Large Language Models (LLMs) into task-specific Small Language Models (SLMs) encounters two significant challenges: safeguarding domain-specific knowledge privacy and managing limited resources. To tackle these challenges, we propose PPC-GPT, a innovative privacy-preserving federated framework specifically designed for compressing LLMs into task-specific SLMs via pruning and Chain-of-Thought (COT) distillation. PPC-GPT works on a server-client federated architecture, where the client sends differentially private (DP) perturbed task-specific data to the server's LLM. The LLM then generates synthetic data along with their corresponding rationales. This synthetic data is subsequently used for both LLM pruning and retraining processes. Additionally, we harness COT knowledge distillation, leveraging the synthetic data to further improve the retraining of structurally-pruned SLMs. Our experimental results demonstrate the effectiveness of PPC-GPT across various text generation tasks. By compressing LLMs into task-specific SLMs, PPC-GPT not only achieves competitive performance but also prioritizes data privacy protection. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.15857v1-abstract-full').style.display = 'none'; document.getElementById('2502.15857v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.15842">arXiv:2502.15842</a> <span> [<a href="https://arxiv.org/pdf/2502.15842">pdf</a>, <a href="https://arxiv.org/format/2502.15842">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> From Target Tracking to Targeting Track -- Part I: A Metric for Spatio-Temporal Trajectory Evaluation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+T">Tiancheng Li</a>, <a href="/search/cs?searchtype=author&query=Song%2C+Y">Yan Song</a>, <a href="/search/cs?searchtype=author&query=Fan%2C+H">Hongqi Fan</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+J">Jingdong Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.15842v1-abstract-short" style="display: inline;"> In the realm of target tracking, performance evaluation plays a pivotal role in the design, comparison, and analytics of trackers. Compared with the traditional trajectory composed of a set of point-estimates obtained by a tracker in the measurement time-series, the trajectory that our series of studies including this paper pursued is given by a curve function of time (FoT). The trajectory FoT pro… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.15842v1-abstract-full').style.display = 'inline'; document.getElementById('2502.15842v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.15842v1-abstract-full" style="display: none;"> In the realm of target tracking, performance evaluation plays a pivotal role in the design, comparison, and analytics of trackers. Compared with the traditional trajectory composed of a set of point-estimates obtained by a tracker in the measurement time-series, the trajectory that our series of studies including this paper pursued is given by a curve function of time (FoT). The trajectory FoT provides complete information of the movement of the target over time and can be used to infer the state corresponding to arbitrary time, not only at the measurement time. However, there are no metrics available for comparing and evaluating the trajectory FoT. To address this lacuna, we propose a metric denominated as the spatiotemporal-aligned trajectory integral distance (Star-ID). The StarID associates and aligns the estimated and actual trajectories in the spatio-temporal domain and distinguishes between the time-aligned and unaligned segments in calculating the spatial divergence including false alarm, miss-detection and localization errors. The effectiveness of the proposed distance metric and the time-averaged version is validated through theoretical analysis and numerical examples of a single target or multiple targets. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.15842v1-abstract-full').style.display = 'none'; document.getElementById('2502.15842v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Part I of a series of companion papers; 11 pages, 10 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.15344">arXiv:2502.15344</a> <span> [<a href="https://arxiv.org/pdf/2502.15344">pdf</a>, <a href="https://arxiv.org/format/2502.15344">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Software Engineering">cs.SE</span> </div> </div> <p class="title is-5 mathjax"> Computation Tree Logic Guided Program Repair </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yu Liu</a>, <a href="/search/cs?searchtype=author&query=Song%2C+Y">Yahui Song</a>, <a href="/search/cs?searchtype=author&query=Mirchev%2C+M">Martin Mirchev</a>, <a href="/search/cs?searchtype=author&query=Roychoudhury%2C+A">Abhik Roychoudhury</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.15344v1-abstract-short" style="display: inline;"> Temporal logics like Computation Tree Logic (CTL) have been widely used as expressive formalisms to capture rich behavioral specifications. CTL can express properties such as reachability, termination, invariants and responsiveness, which are difficult to test. This paper suggests a mechanism for the automated repair of infinite-state programs guided by CTL properties. Our produced patches avoid t… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.15344v1-abstract-full').style.display = 'inline'; document.getElementById('2502.15344v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.15344v1-abstract-full" style="display: none;"> Temporal logics like Computation Tree Logic (CTL) have been widely used as expressive formalisms to capture rich behavioral specifications. CTL can express properties such as reachability, termination, invariants and responsiveness, which are difficult to test. This paper suggests a mechanism for the automated repair of infinite-state programs guided by CTL properties. Our produced patches avoid the overfitting issue that occurs in test-suite-guided repair, where the repaired code may not pass tests outside the given test suite. To realize this vision, we propose a repair framework based on Datalog, a widely used domain-specific language for program analysis, which readily supports nested fixed-point semantics of CTL via stratified negation. Specifically, our framework encodes the program and CTL properties into Datalog facts and rules and performs the repair by modifying the facts to pass the analysis rules. Previous research proposed a generic repair mechanism for Datalog-based analysis in the form of Symbolic Execution of Datalog (SEDL). However, SEDL only supports positive Datalog, which is insufficient for expressing CTL properties. Thus, we extended SEDL to make it applicable to stratified Datalog. Moreover, liveness property violations involve infinite computations, which we handle via a novel loop summarization. Our approach achieves analysis accuracy of 56.6% on a small-scale benchmark and 88.5% on a real-world benchmark, outperforming the best baseline performances of 27.7% and 76.9%. Our approach repairs all detected bugs, which is not achieved by existing tools. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.15344v1-abstract-full').style.display = 'none'; document.getElementById('2502.15344v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.14614">arXiv:2502.14614</a> <span> [<a href="https://arxiv.org/pdf/2502.14614">pdf</a>, <a href="https://arxiv.org/format/2502.14614">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> FIND: Fine-grained Information Density Guided Adaptive Retrieval-Augmented Generation for Disease Diagnosis </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Jia%2C+M">Mingyi Jia</a>, <a href="/search/cs?searchtype=author&query=Duan%2C+J">Junwen Duan</a>, <a href="/search/cs?searchtype=author&query=Song%2C+Y">Yan Song</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+J">Jianxin Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.14614v1-abstract-short" style="display: inline;"> Retrieval-Augmented Large Language Models (LLMs), which integrate external knowledge into LLMs, have shown remarkable performance in various medical domains, including clinical diagnosis. However, existing RAG methods struggle to effectively assess task difficulty to make retrieval decisions, thereby failing to meet the clinical requirements for balancing efficiency and accuracy. So in this paper,… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.14614v1-abstract-full').style.display = 'inline'; document.getElementById('2502.14614v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.14614v1-abstract-full" style="display: none;"> Retrieval-Augmented Large Language Models (LLMs), which integrate external knowledge into LLMs, have shown remarkable performance in various medical domains, including clinical diagnosis. However, existing RAG methods struggle to effectively assess task difficulty to make retrieval decisions, thereby failing to meet the clinical requirements for balancing efficiency and accuracy. So in this paper, we propose FIND (\textbf{F}ine-grained \textbf{In}formation \textbf{D}ensity Guided Adaptive RAG), a novel framework that improves the reliability of RAG in disease diagnosis scenarios. FIND incorporates a fine-grained adaptive control module to determine whether retrieval is necessary based on the information density of the input. By optimizing the retrieval process and implementing a knowledge filtering module, FIND ensures that the retrieval is better suited to clinical scenarios. Experiments on three Chinese electronic medical record datasets demonstrate that FIND significantly outperforms various baseline methods, highlighting its effectiveness in clinical diagnosis tasks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.14614v1-abstract-full').style.display = 'none'; document.getElementById('2502.14614v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.14397">arXiv:2502.14397</a> <span> [<a href="https://arxiv.org/pdf/2502.14397">pdf</a>, <a href="https://arxiv.org/format/2502.14397">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> PhotoDoodle: Learning Artistic Image Editing from Few-Shot Pairwise Data </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Huang%2C+S">Shijie Huang</a>, <a href="/search/cs?searchtype=author&query=Song%2C+Y">Yiren Song</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yuxuan Zhang</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+H">Hailong Guo</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xueyin Wang</a>, <a href="/search/cs?searchtype=author&query=Shou%2C+M+Z">Mike Zheng Shou</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+J">Jiaming Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.14397v2-abstract-short" style="display: inline;"> We introduce PhotoDoodle, a novel image editing framework designed to facilitate photo doodling by enabling artists to overlay decorative elements onto photographs. Photo doodling is challenging because the inserted elements must appear seamlessly integrated with the background, requiring realistic blending, perspective alignment, and contextual coherence. Additionally, the background must be pres… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.14397v2-abstract-full').style.display = 'inline'; document.getElementById('2502.14397v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.14397v2-abstract-full" style="display: none;"> We introduce PhotoDoodle, a novel image editing framework designed to facilitate photo doodling by enabling artists to overlay decorative elements onto photographs. Photo doodling is challenging because the inserted elements must appear seamlessly integrated with the background, requiring realistic blending, perspective alignment, and contextual coherence. Additionally, the background must be preserved without distortion, and the artist's unique style must be captured efficiently from limited training data. These requirements are not addressed by previous methods that primarily focus on global style transfer or regional inpainting. The proposed method, PhotoDoodle, employs a two-stage training strategy. Initially, we train a general-purpose image editing model, OmniEditor, using large-scale data. Subsequently, we fine-tune this model with EditLoRA using a small, artist-curated dataset of before-and-after image pairs to capture distinct editing styles and techniques. To enhance consistency in the generated results, we introduce a positional encoding reuse mechanism. Additionally, we release a PhotoDoodle dataset featuring six high-quality styles. Extensive experiments demonstrate the advanced performance and robustness of our method in customized image editing, opening new possibilities for artistic creation. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.14397v2-abstract-full').style.display = 'none'; document.getElementById('2502.14397v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 20 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.13416">arXiv:2502.13416</a> <span> [<a href="https://arxiv.org/pdf/2502.13416">pdf</a>, <a href="https://arxiv.org/format/2502.13416">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Detecting LLM Fact-conflicting Hallucinations Enhanced by Temporal-logic-based Reasoning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+N">Ningke Li</a>, <a href="/search/cs?searchtype=author&query=Song%2C+Y">Yahui Song</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+K">Kailong Wang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yuekang Li</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+L">Ling Shi</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yi Liu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+H">Haoyu Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.13416v1-abstract-short" style="display: inline;"> Large language models (LLMs) face the challenge of hallucinations -- outputs that seem coherent but are actually incorrect. A particularly damaging type is fact-conflicting hallucination (FCH), where generated content contradicts established facts. Addressing FCH presents three main challenges: 1) Automatically constructing and maintaining large-scale benchmark datasets is difficult and resource-i… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.13416v1-abstract-full').style.display = 'inline'; document.getElementById('2502.13416v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.13416v1-abstract-full" style="display: none;"> Large language models (LLMs) face the challenge of hallucinations -- outputs that seem coherent but are actually incorrect. A particularly damaging type is fact-conflicting hallucination (FCH), where generated content contradicts established facts. Addressing FCH presents three main challenges: 1) Automatically constructing and maintaining large-scale benchmark datasets is difficult and resource-intensive; 2) Generating complex and efficient test cases that the LLM has not been trained on -- especially those involving intricate temporal features -- is challenging, yet crucial for eliciting hallucinations; and 3) Validating the reasoning behind LLM outputs is inherently difficult, particularly with complex logical relationships, as it requires transparency in the model's decision-making process. This paper presents Drowzee, an innovative end-to-end metamorphic testing framework that utilizes temporal logic to identify fact-conflicting hallucinations (FCH) in large language models (LLMs). Drowzee builds a comprehensive factual knowledge base by crawling sources like Wikipedia and uses automated temporal-logic reasoning to convert this knowledge into a large, extensible set of test cases with ground truth answers. LLMs are tested using these cases through template-based prompts, which require them to generate both answers and reasoning steps. To validate the reasoning, we propose two semantic-aware oracles that compare the semantic structure of LLM outputs to the ground truths. Across nine LLMs in nine different knowledge domains, experimental results show that Drowzee effectively identifies rates of non-temporal-related hallucinations ranging from 24.7% to 59.8%, and rates of temporal-related hallucinations ranging from 16.7% to 39.2%. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.13416v1-abstract-full').style.display = 'none'; document.getElementById('2502.13416v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">16 pages, under review. arXiv admin note: substantial text overlap with arXiv:2405.00648</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.12396">arXiv:2502.12396</a> <span> [<a href="https://arxiv.org/pdf/2502.12396">pdf</a>, <a href="https://arxiv.org/format/2502.12396">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Fluid Dynamics">physics.flu-dyn</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computational Engineering, Finance, and Science">cs.CE</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Scientific Machine Learning of Flow Resistance Using Universal Shallow Water Equations with Differentiable Programming </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+X">Xiaofeng Liu</a>, <a href="/search/cs?searchtype=author&query=Song%2C+Y">Yalan Song</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.12396v1-abstract-short" style="display: inline;"> Shallow water equations (SWEs) are the backbone of most hydrodynamics models for flood prediction, river engineering, and many other water resources applications. The estimation of flow resistance, i.e., the Manning's roughness coefficient $n$, is crucial for ensuring model accuracy, and has been previously determined using empirical formulas or tables. To better account for temporal and spatial v… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.12396v1-abstract-full').style.display = 'inline'; document.getElementById('2502.12396v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.12396v1-abstract-full" style="display: none;"> Shallow water equations (SWEs) are the backbone of most hydrodynamics models for flood prediction, river engineering, and many other water resources applications. The estimation of flow resistance, i.e., the Manning's roughness coefficient $n$, is crucial for ensuring model accuracy, and has been previously determined using empirical formulas or tables. To better account for temporal and spatial variability in channel roughness, inverse modeling of $n$ using observed flow data is more reliable and adaptable; however, it is challenging when using traditional SWE solvers. Based on the concept of universal differential equation (UDE), which combines physics-based differential equations with neural networks (NNs), we developed a universal SWEs (USWEs) solver, Hydrograd, for hybrid hydrodynamics modeling. It can do accurate forward simulations, support automatic differentiation (AD) for gradient-based sensitivity analysis and parameter inversion, and perform scientific machine learning for physics discovery. In this work, we first validated the accuracy of its forward modeling, then applied a real-world case to demonstrate the ability of USWEs to capture model sensitivity (gradients) and perform inverse modeling of Manning's $n$. Furthermore, we used a NN to learn a universal relationship between $n$, hydraulic parameters, and flow in a real river channel. Unlike inverse modeling using surrogate models, Hydrograd uses a two-dimensional SWEs solver as its physics backbone, which eliminates the need for data-intensive pretraining and resolves the generalization problem when applied to out-of-sample scenarios. This differentiable modeling approach, with seamless integration with NNs, provides a new pathway for solving complex inverse problems and discovering new physics in hydrodynamics. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.12396v1-abstract-full').style.display = 'none'; document.getElementById('2502.12396v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.11544">arXiv:2502.11544</a> <span> [<a href="https://arxiv.org/pdf/2502.11544">pdf</a>, <a href="https://arxiv.org/format/2502.11544">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Evaluating o1-Like LLMs: Unlocking Reasoning for Translation through Comprehensive Analysis </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+A">Andong Chen</a>, <a href="/search/cs?searchtype=author&query=Song%2C+Y">Yuchen Song</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+W">Wenxin Zhu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+K">Kehai Chen</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+M">Muyun Yang</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+T">Tiejun Zhao</a>, <a href="/search/cs?searchtype=author&query=zhang%2C+M">Min zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.11544v1-abstract-short" style="display: inline;"> The o1-Like LLMs are transforming AI by simulating human cognitive processes, but their performance in multilingual machine translation (MMT) remains underexplored. This study examines: (1) how o1-Like LLMs perform in MMT tasks and (2) what factors influence their translation quality. We evaluate multiple o1-Like LLMs and compare them with traditional models like ChatGPT and GPT-4o. Results show t… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.11544v1-abstract-full').style.display = 'inline'; document.getElementById('2502.11544v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.11544v1-abstract-full" style="display: none;"> The o1-Like LLMs are transforming AI by simulating human cognitive processes, but their performance in multilingual machine translation (MMT) remains underexplored. This study examines: (1) how o1-Like LLMs perform in MMT tasks and (2) what factors influence their translation quality. We evaluate multiple o1-Like LLMs and compare them with traditional models like ChatGPT and GPT-4o. Results show that o1-Like LLMs establish new multilingual translation benchmarks, with DeepSeek-R1 surpassing GPT-4o in contextless tasks. They demonstrate strengths in historical and cultural translation but exhibit a tendency for rambling issues in Chinese-centric outputs. Further analysis reveals three key insights: (1) High inference costs and slower processing speeds make complex translation tasks more resource-intensive. (2) Translation quality improves with model size, enhancing commonsense reasoning and cultural translation. (3) The temperature parameter significantly impacts output quality-lower temperatures yield more stable and accurate translations, while higher temperatures reduce coherence and precision. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.11544v1-abstract-full').style.display = 'none'; document.getElementById('2502.11544v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.11478">arXiv:2502.11478</a> <span> [<a href="https://arxiv.org/pdf/2502.11478">pdf</a>, <a href="https://arxiv.org/format/2502.11478">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> TAPS: Throat and Acoustic Paired Speech Dataset for Deep Learning-Based Speech Enhancement </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Kim%2C+Y">Yunsik Kim</a>, <a href="/search/cs?searchtype=author&query=Song%2C+Y">Yonghun Song</a>, <a href="/search/cs?searchtype=author&query=Chung%2C+Y">Yoonyoung Chung</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.11478v1-abstract-short" style="display: inline;"> In high-noise environments such as factories, subways, and busy streets, capturing clear speech is challenging due to background noise. Throat microphones provide a solution with their noise-suppressing properties, reducing the noise while recording speech. However, a significant limitation remains: high-frequency information is attenuated as sound waves pass through skin and tissue, reducing spee… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.11478v1-abstract-full').style.display = 'inline'; document.getElementById('2502.11478v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.11478v1-abstract-full" style="display: none;"> In high-noise environments such as factories, subways, and busy streets, capturing clear speech is challenging due to background noise. Throat microphones provide a solution with their noise-suppressing properties, reducing the noise while recording speech. However, a significant limitation remains: high-frequency information is attenuated as sound waves pass through skin and tissue, reducing speech clarity. Recent deep learning approaches have shown promise in enhancing throat microphone recordings, but further progress is constrained by the absence of standardized dataset. We introduce a throat and acoustic paired speech dataset (TAPS), a collection of paired utterances recorded from 60 native Korean speakers using throat and acoustic microphones. To demonstrate the TAPS's utility, we tested three baseline deep learning models and identified the mapping-based approach as superior in improving speech quality and restoring content. Additionally, we propose an optimal method to mitigate the signal mismatch between throat and acoustic microphones, ensuring model performance. These results highlight the potential of TAPS to serve as a standardized dataset and advance research in throat microphone-based speech enhancement. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.11478v1-abstract-full').style.display = 'none'; document.getElementById('2502.11478v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.11410">arXiv:2502.11410</a> <span> [<a href="https://arxiv.org/pdf/2502.11410">pdf</a>, <a href="https://arxiv.org/format/2502.11410">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Structure based SAT dataset for analysing GNN generalisation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Fu%2C+Y">Yi Fu</a>, <a href="/search/cs?searchtype=author&query=Tompkins%2C+A">Anthony Tompkins</a>, <a href="/search/cs?searchtype=author&query=Song%2C+Y">Yang Song</a>, <a href="/search/cs?searchtype=author&query=Pagnucco%2C+M">Maurice Pagnucco</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.11410v1-abstract-short" style="display: inline;"> Satisfiability (SAT) solvers based on techniques such as conflict driven clause learning (CDCL) have produced excellent performance on both synthetic and real world industrial problems. While these CDCL solvers only operate on a per-problem basis, graph neural network (GNN) based solvers bring new benefits to the field by allowing practitioners to exploit knowledge gained from solved problems to e… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.11410v1-abstract-full').style.display = 'inline'; document.getElementById('2502.11410v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.11410v1-abstract-full" style="display: none;"> Satisfiability (SAT) solvers based on techniques such as conflict driven clause learning (CDCL) have produced excellent performance on both synthetic and real world industrial problems. While these CDCL solvers only operate on a per-problem basis, graph neural network (GNN) based solvers bring new benefits to the field by allowing practitioners to exploit knowledge gained from solved problems to expedite solving of new SAT problems. However, one specific area that is often studied in the context of CDCL solvers, but largely overlooked in GNN solvers, is the relationship between graph theoretic measure of structure in SAT problems and the generalisation ability of GNN solvers. To bridge the gap between structural graph properties (e.g., modularity, self-similarity) and the generalisability (or lack thereof) of GNN based SAT solvers, we present StructureSAT: a curated dataset, along with code to further generate novel examples, containing a diverse set of SAT problems from well known problem domains. Furthermore, we utilise a novel splitting method that focuses on deconstructing the families into more detailed hierarchies based on their structural properties. With the new dataset, we aim to help explain problematic generalisation in existing GNN SAT solvers by exploiting knowledge of structural graph properties. We conclude with multiple future directions that can help researchers in GNN based SAT solving develop more effective and generalisable SAT solvers. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.11410v1-abstract-full').style.display = 'none'; document.getElementById('2502.11410v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">to be published in 28th International Conference on Artificial Intelligence and Statistics (AISTATS) 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.11201">arXiv:2502.11201</a> <span> [<a href="https://arxiv.org/pdf/2502.11201">pdf</a>, <a href="https://arxiv.org/format/2502.11201">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Databases">cs.DB</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Bridging the Gap: Enabling Natural Language Queries for NoSQL Databases through Text-to-NoSQL Translation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lu%2C+J">Jinwei Lu</a>, <a href="/search/cs?searchtype=author&query=Song%2C+Y">Yuanfeng Song</a>, <a href="/search/cs?searchtype=author&query=Qin%2C+Z">Zhiqian Qin</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+H">Haodi Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+C">Chen Zhang</a>, <a href="/search/cs?searchtype=author&query=Wong%2C+R+C">Raymond Chi-Wing Wong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.11201v2-abstract-short" style="display: inline;"> NoSQL databases have become increasingly popular due to their outstanding performance in handling large-scale, unstructured, and semi-structured data, highlighting the need for user-friendly interfaces to bridge the gap between non-technical users and complex database queries. In this paper, we introduce the Text-to-NoSQL task, which aims to convert natural language queries into NoSQL queries, the… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.11201v2-abstract-full').style.display = 'inline'; document.getElementById('2502.11201v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.11201v2-abstract-full" style="display: none;"> NoSQL databases have become increasingly popular due to their outstanding performance in handling large-scale, unstructured, and semi-structured data, highlighting the need for user-friendly interfaces to bridge the gap between non-technical users and complex database queries. In this paper, we introduce the Text-to-NoSQL task, which aims to convert natural language queries into NoSQL queries, thereby lowering the technical barrier for non-expert users. To promote research in this area, we developed a novel automated dataset construction process and released a large-scale and open-source dataset for this task, named TEND (short for Text-to-NoSQL Dataset). Additionally, we designed a SLM (Small Language Model)-assisted and RAG (Retrieval-augmented Generation)-assisted multi-step framework called SMART, which is specifically designed for Text-to-NoSQL conversion. To ensure comprehensive evaluation of the models, we also introduced a detailed set of metrics that assess the model's performance from both the query itself and its execution results. Our experimental results demonstrate the effectiveness of our approach and establish a benchmark for future research in this emerging field. We believe that our contributions will pave the way for more accessible and intuitive interactions with NoSQL databases. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.11201v2-abstract-full').style.display = 'none'; document.getElementById('2502.11201v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 16 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.11176">arXiv:2502.11176</a> <span> [<a href="https://arxiv.org/pdf/2502.11176">pdf</a>, <a href="https://arxiv.org/format/2502.11176">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> LogiDynamics: Unraveling the Dynamics of Logical Inference in Large Language Model Reasoning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zheng%2C+T">Tianshi Zheng</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+J">Jiayang Cheng</a>, <a href="/search/cs?searchtype=author&query=Li%2C+C">Chunyang Li</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+H">Haochen Shi</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zihao Wang</a>, <a href="/search/cs?searchtype=author&query=Bai%2C+J">Jiaxin Bai</a>, <a href="/search/cs?searchtype=author&query=Song%2C+Y">Yangqiu Song</a>, <a href="/search/cs?searchtype=author&query=Wong%2C+G+Y">Ginny Y. Wong</a>, <a href="/search/cs?searchtype=author&query=See%2C+S">Simon See</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.11176v1-abstract-short" style="display: inline;"> Modern large language models (LLMs) employ various forms of logical inference, both implicitly and explicitly, when addressing reasoning tasks. Understanding how to optimally leverage these inference paradigms is critical for advancing LLMs' reasoning capabilities. This paper adopts an exploratory approach by introducing a controlled evaluation environment for analogical reasoning -- a fundamental… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.11176v1-abstract-full').style.display = 'inline'; document.getElementById('2502.11176v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.11176v1-abstract-full" style="display: none;"> Modern large language models (LLMs) employ various forms of logical inference, both implicitly and explicitly, when addressing reasoning tasks. Understanding how to optimally leverage these inference paradigms is critical for advancing LLMs' reasoning capabilities. This paper adopts an exploratory approach by introducing a controlled evaluation environment for analogical reasoning -- a fundamental cognitive task -- that is systematically parameterized across three dimensions: modality (textual, visual, symbolic), difficulty (easy, medium, hard), and task format (multiple-choice or free-text generation). We analyze the comparative dynamics of inductive, abductive, and deductive inference pipelines across these dimensions, and demonstrate that our findings generalize to broader in-context learning tasks. Additionally, we investigate advanced paradigms such as hypothesis selection, verification, and refinement, revealing their potential to scale up logical inference in LLM reasoning. This exploratory study provides a foundation for future research in enhancing LLM reasoning through systematic logical inference strategies. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.11176v1-abstract-full').style.display = 'none'; document.getElementById('2502.11176v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">21 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.11022">arXiv:2502.11022</a> <span> [<a href="https://arxiv.org/pdf/2502.11022">pdf</a>, <a href="https://arxiv.org/format/2502.11022">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> MultiTEND: A Multilingual Benchmark for Natural Language to NoSQL Query Translation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Qin%2C+Z">Zhiqian Qin</a>, <a href="/search/cs?searchtype=author&query=Song%2C+Y">Yuanfeng Song</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+J">Jinwei Lu</a>, <a href="/search/cs?searchtype=author&query=Song%2C+Y">Yuanwei Song</a>, <a href="/search/cs?searchtype=author&query=Li%2C+S">Shuaimin Li</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+C+J">Chen Jason Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.11022v1-abstract-short" style="display: inline;"> Natural language interfaces for NoSQL databases are increasingly vital in the big data era, enabling users to interact with complex, unstructured data without deep technical expertise. However, most recent advancements focus on English, leaving a gap for multilingual support. This paper introduces MultiTEND, the first and largest multilingual benchmark for natural language to NoSQL query generatio… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.11022v1-abstract-full').style.display = 'inline'; document.getElementById('2502.11022v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.11022v1-abstract-full" style="display: none;"> Natural language interfaces for NoSQL databases are increasingly vital in the big data era, enabling users to interact with complex, unstructured data without deep technical expertise. However, most recent advancements focus on English, leaving a gap for multilingual support. This paper introduces MultiTEND, the first and largest multilingual benchmark for natural language to NoSQL query generation, covering six languages: English, German, French, Russian, Japanese and Mandarin Chinese. Using MultiTEND, we analyze challenges in translating natural language to NoSQL queries across diverse linguistic structures, including lexical and syntactic differences. Experiments show that performance accuracy in both English and non-English settings remains relatively low, with a 4%-6% gap across scenarios like fine-tuned SLM, zero-shot LLM, and RAG for LLM. To address the aforementioned challenges, we introduce MultiLink, a novel framework that bridges the multilingual input to NoSQL query generation gap through a Parallel Linking Process. It breaks down the task into multiple steps, integrating parallel multilingual processing, Chain-of-Thought (CoT) reasoning, and Retrieval-Augmented Generation (RAG) to tackle lexical and structural challenges inherent in multilingual NoSQL generation. MultiLink shows enhancements in all metrics for every language against the top baseline, boosting execution accuracy by about 15% for English and averaging a 10% improvement for non-English languages. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.11022v1-abstract-full').style.display = 'none'; document.getElementById('2502.11022v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.10783">arXiv:2502.10783</a> <span> [<a href="https://arxiv.org/pdf/2502.10783">pdf</a>, <a href="https://arxiv.org/format/2502.10783">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Instrumentation and Methods for Astrophysics">astro-ph.IM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Astrophysics of Galaxies">astro-ph.GA</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Solar and Stellar Astrophysics">astro-ph.SR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> </div> </div> <p class="title is-5 mathjax"> Image Pre-Processing Framework for Time-Domain Astronomy in the Artificial Intelligence Era </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Cao%2C+L">Liang Cao</a>, <a href="/search/cs?searchtype=author&query=Jia%2C+P">Peng Jia</a>, <a href="/search/cs?searchtype=author&query=Li%2C+J">Jiaxin Li</a>, <a href="/search/cs?searchtype=author&query=Song%2C+Y">Yu Song</a>, <a href="/search/cs?searchtype=author&query=Hou%2C+C">Chengkun Hou</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yushan Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.10783v1-abstract-short" style="display: inline;"> The rapid advancement of image analysis methods in time-domain astronomy, particularly those leveraging AI algorithms, has highlighted efficient image pre-processing as a critical bottleneck affecting algorithm performance. Image pre-processing, which involves standardizing images for training or deployment of various AI algorithms, encompasses essential steps such as image quality evaluation, ali… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.10783v1-abstract-full').style.display = 'inline'; document.getElementById('2502.10783v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.10783v1-abstract-full" style="display: none;"> The rapid advancement of image analysis methods in time-domain astronomy, particularly those leveraging AI algorithms, has highlighted efficient image pre-processing as a critical bottleneck affecting algorithm performance. Image pre-processing, which involves standardizing images for training or deployment of various AI algorithms, encompasses essential steps such as image quality evaluation, alignment, stacking, background extraction, gray-scale transformation, cropping, source detection, astrometry, and photometry. Historically, these algorithms were developed independently by different research groups, primarily based on CPU architecture for small-scale data processing. This paper introduces a novel framework for image pre-processing that integrates key algorithms specifically modified for GPU architecture, enabling large-scale image pre-processing for different algorithms. To prepare for the new algorithm design paradigm in the AI era, we have implemented two operational modes in the framework for different application scenarios: Eager mode and Pipeline mode. The Eager mode facilitates real-time feedback and flexible adjustments, which could be used for parameter tuning and algorithm development. The pipeline mode is primarily designed for large scale data processing, which could be used for training or deploying of artificial intelligence models. We have tested the performance of our framework using simulated and real observation images. Results demonstrate that our framework significantly enhances image pre-processing speed while maintaining accuracy levels comparable to CPU based algorithms. To promote accessibility and ease of use, a Docker version of our framework is available for download in the PaperData Repository powered by China-VO, compatible with various AI algorithms developed for time-domain astronomy research. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.10783v1-abstract-full').style.display = 'none'; document.getElementById('2502.10783v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by the AJ. The Docker Version could be found in the PaperData Repository powered by China-VO</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.10563">arXiv:2502.10563</a> <span> [<a href="https://arxiv.org/pdf/2502.10563">pdf</a>, <a href="https://arxiv.org/format/2502.10563">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Accelerating Unbiased LLM Evaluation via Synthetic Feedback </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhou%2C+Z">Zhaoyi Zhou</a>, <a href="/search/cs?searchtype=author&query=Song%2C+Y">Yuda Song</a>, <a href="/search/cs?searchtype=author&query=Zanette%2C+A">Andrea Zanette</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.10563v1-abstract-short" style="display: inline;"> When developing new large language models (LLMs), a key step is evaluating their final performance, often by computing the win-rate against a reference model based on external feedback. Human feedback is the gold standard, particularly for capturing nuanced qualities like coherence, readability, and alignment with human expectations. However, human evaluations are costly -- even for large tech com… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.10563v1-abstract-full').style.display = 'inline'; document.getElementById('2502.10563v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.10563v1-abstract-full" style="display: none;"> When developing new large language models (LLMs), a key step is evaluating their final performance, often by computing the win-rate against a reference model based on external feedback. Human feedback is the gold standard, particularly for capturing nuanced qualities like coherence, readability, and alignment with human expectations. However, human evaluations are costly -- even for large tech companies -- and when conducted with active users, they may negatively impact user experience. A promising alternative is synthetic feedback, where evaluations are conducted by other large language models, including reward models. While this eliminates the need for costly human annotations, it introduces biases that may distort the evaluation process. In this work, we propose a statistically principled framework that integrates human and synthetic feedback to reduce reliance on human annotations while maintaining unbiased win-rate calculations. Our experiments demonstrate a reduction in human annotations by up to 12.2% with an off-the-shelf synthetic evaluator and up to 24.8% with a finetuned variant. Apart from being generalizable, scalable, and free of hyper-parameter tuning, our method offers predictable annotation savings, which can be estimated based on data-dependent characteristics. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.10563v1-abstract-full').style.display = 'none'; document.getElementById('2502.10563v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.09125">arXiv:2502.09125</a> <span> [<a href="https://arxiv.org/pdf/2502.09125">pdf</a>, <a href="https://arxiv.org/format/2502.09125">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Automatic Pruning via Structured Lasso with Class-wise Information </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+X">Xiang Liu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+M">Mingchen Li</a>, <a href="/search/cs?searchtype=author&query=Li%2C+X">Xia Li</a>, <a href="/search/cs?searchtype=author&query=Qu%2C+L">Leigang Qu</a>, <a href="/search/cs?searchtype=author&query=Peng%2C+Z">Zifan Peng</a>, <a href="/search/cs?searchtype=author&query=Song%2C+Y">Yijun Song</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Z">Zemin Liu</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+L">Linshan Jiang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+J">Jialin Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.09125v1-abstract-short" style="display: inline;"> Most pruning methods concentrate on unimportant filters of neural networks. However, they face the loss of statistical information due to a lack of consideration for class-wise data. In this paper, from the perspective of leveraging precise class-wise information for model pruning, we utilize structured lasso with guidance from Information Bottleneck theory. Our approach ensures that statistical i… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.09125v1-abstract-full').style.display = 'inline'; document.getElementById('2502.09125v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.09125v1-abstract-full" style="display: none;"> Most pruning methods concentrate on unimportant filters of neural networks. However, they face the loss of statistical information due to a lack of consideration for class-wise data. In this paper, from the perspective of leveraging precise class-wise information for model pruning, we utilize structured lasso with guidance from Information Bottleneck theory. Our approach ensures that statistical information is retained during the pruning process. With these techniques, we introduce two innovative adaptive network pruning schemes: sparse graph-structured lasso pruning with Information Bottleneck (\textbf{sGLP-IB}) and sparse tree-guided lasso pruning with Information Bottleneck (\textbf{sTLP-IB}). The key aspect is pruning model filters using sGLP-IB and sTLP-IB to better capture class-wise relatedness. Compared to multiple state-of-the-art methods, our approaches demonstrate superior performance across three datasets and six model architectures in extensive experiments. For instance, using the VGG16 model on the CIFAR-10 dataset, we achieve a parameter reduction of 85%, a decrease in FLOPs by 61%, and maintain an accuracy of 94.10% (0.14% higher than the original model); we reduce the parameters by 55% with the accuracy at 76.12% using the ResNet architecture on ImageNet (only drops 0.03%). In summary, we successfully reduce model size and computational resource usage while maintaining accuracy. Our codes are at https://anonymous.4open.science/r/IJCAI-8104. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.09125v1-abstract-full').style.display = 'none'; document.getElementById('2502.09125v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">11 pages, 2 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.09104">arXiv:2502.09104</a> <span> [<a href="https://arxiv.org/pdf/2502.09104">pdf</a>, <a href="https://arxiv.org/ps/2502.09104">ps</a>, <a href="https://arxiv.org/format/2502.09104">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> One-shot Federated Learning Methods: A Practical Guide </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+X">Xiang Liu</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+Z">Zhenheng Tang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+X">Xia Li</a>, <a href="/search/cs?searchtype=author&query=Song%2C+Y">Yijun Song</a>, <a href="/search/cs?searchtype=author&query=Ji%2C+S">Sijie Ji</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Z">Zemin Liu</a>, <a href="/search/cs?searchtype=author&query=Han%2C+B">Bo Han</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+L">Linshan Jiang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+J">Jialin Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.09104v1-abstract-short" style="display: inline;"> One-shot Federated Learning (OFL) is a distributed machine learning paradigm that constrains client-server communication to a single round, addressing privacy and communication overhead issues associated with multiple rounds of data exchange in traditional Federated Learning (FL). OFL demonstrates the practical potential for integration with future approaches that require collaborative training mo… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.09104v1-abstract-full').style.display = 'inline'; document.getElementById('2502.09104v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.09104v1-abstract-full" style="display: none;"> One-shot Federated Learning (OFL) is a distributed machine learning paradigm that constrains client-server communication to a single round, addressing privacy and communication overhead issues associated with multiple rounds of data exchange in traditional Federated Learning (FL). OFL demonstrates the practical potential for integration with future approaches that require collaborative training models, such as large language models (LLMs). However, current OFL methods face two major challenges: data heterogeneity and model heterogeneity, which result in subpar performance compared to conventional FL methods. Worse still, despite numerous studies addressing these limitations, a comprehensive summary is still lacking. To address these gaps, this paper presents a systematic analysis of the challenges faced by OFL and thoroughly reviews the current methods. We also offer an innovative categorization method and analyze the trade-offs of various techniques. Additionally, we discuss the most promising future directions and the technologies that should be integrated into the OFL field. This work aims to provide guidance and insights for future research. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.09104v1-abstract-full').style.display = 'none'; document.getElementById('2502.09104v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">10 pages, 1 figure</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.09101">arXiv:2502.09101</a> <span> [<a href="https://arxiv.org/pdf/2502.09101">pdf</a>, <a href="https://arxiv.org/format/2502.09101">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> </div> </div> <p class="title is-5 mathjax"> Bridging the Gap Between LLMs and Human Intentions: Progresses and Challenges in Instruction Understanding, Intention Reasoning, and Reliable Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chang%2C+Z">Zongyu Chang</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+F">Feihong Lu</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+Z">Ziqin Zhu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Q">Qian Li</a>, <a href="/search/cs?searchtype=author&query=Ji%2C+C">Cheng Ji</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Z">Zhuo Chen</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yang Liu</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+R">Ruifeng Xu</a>, <a href="/search/cs?searchtype=author&query=Song%2C+Y">Yangqiu Song</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+S">Shangguang Wang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+J">Jianxin Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.09101v1-abstract-short" style="display: inline;"> Large language models (LLMs) have demonstrated exceptional capabilities in understanding and generation. However, when interacting with human instructions in real-world scenarios, LLMs still face significant challenges, particularly in accurately capturing and comprehending human instructions and intentions. This paper focuses on three challenges in LLM-based text generation tasks: instruction und… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.09101v1-abstract-full').style.display = 'inline'; document.getElementById('2502.09101v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.09101v1-abstract-full" style="display: none;"> Large language models (LLMs) have demonstrated exceptional capabilities in understanding and generation. However, when interacting with human instructions in real-world scenarios, LLMs still face significant challenges, particularly in accurately capturing and comprehending human instructions and intentions. This paper focuses on three challenges in LLM-based text generation tasks: instruction understanding, intention reasoning, and reliable generation. Regarding human complex instruction, LLMs have deficiencies in understanding long contexts and instructions in multi-round conversations. For intention reasoning, LLMs may have inconsistent command reasoning, difficulty reasoning about commands containing incorrect information, difficulty understanding user ambiguous language commands, and a weak understanding of user intention in commands. Besides, In terms of reliable generation, LLMs may have unstable generated content and unethical generation. To this end, we classify and analyze the performance of LLMs in challenging scenarios and conduct a comprehensive evaluation of existing solutions. Furthermore, we introduce benchmarks and categorize them based on the aforementioned three core challenges. Finally, we explore potential directions for future research to enhance the reliability and adaptability of LLMs in real-world applications. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.09101v1-abstract-full').style.display = 'none'; document.getElementById('2502.09101v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">9 pages, 5 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.08211">arXiv:2502.08211</a> <span> [<a href="https://arxiv.org/pdf/2502.08211">pdf</a>, <a href="https://arxiv.org/format/2502.08211">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Quality over Quantity: Boosting Data Efficiency Through Ensembled Multimodal Data Curation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xu%2C+J">Jinda Xu</a>, <a href="/search/cs?searchtype=author&query=Song%2C+Y">Yuhao Song</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+D">Daming Wang</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+W">Weiwei Zhao</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+M">Minghua Chen</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+K">Kangliang Chen</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Q">Qinya Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.08211v1-abstract-short" style="display: inline;"> In an era overwhelmed by vast amounts of data, the effective curation of web-crawl datasets is essential for optimizing model performance. This paper tackles the challenges associated with the unstructured and heterogeneous nature of such datasets. Traditional heuristic curation methods often inadequately capture complex features, resulting in biases and the exclusion of relevant data. We introduc… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.08211v1-abstract-full').style.display = 'inline'; document.getElementById('2502.08211v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.08211v1-abstract-full" style="display: none;"> In an era overwhelmed by vast amounts of data, the effective curation of web-crawl datasets is essential for optimizing model performance. This paper tackles the challenges associated with the unstructured and heterogeneous nature of such datasets. Traditional heuristic curation methods often inadequately capture complex features, resulting in biases and the exclusion of relevant data. We introduce an advanced, learning-driven approach, Ensemble Curation Of DAta ThroUgh Multimodal Operators (EcoDatum), incorporating a novel quality-guided deduplication method to ensure balanced feature distributions. EcoDatum strategically integrates various unimodal and multimodal data curation operators within a weak supervision ensemble framework, utilizing automated optimization to score each data point effectively. EcoDatum, which significantly improves the data curation quality and efficiency, outperforms existing state-of-the-art (SOTA) techniques, ranked 1st on the DataComp leaderboard, with an average performance score of 0.182 across 38 diverse evaluation datasets. This represents a 28% improvement over the DataComp baseline method, demonstrating its effectiveness in improving dataset curation and model training efficiency. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.08211v1-abstract-full').style.display = 'none'; document.getElementById('2502.08211v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.06882">arXiv:2502.06882</a> <span> [<a href="https://arxiv.org/pdf/2502.06882">pdf</a>, <a href="https://arxiv.org/format/2502.06882">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Multi-Agent Simulator Drives Language Models for Legal Intensive Interaction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yue%2C+S">Shengbin Yue</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+T">Ting Huang</a>, <a href="/search/cs?searchtype=author&query=Jia%2C+Z">Zheng Jia</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+S">Siyuan Wang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+S">Shujun Liu</a>, <a href="/search/cs?searchtype=author&query=Song%2C+Y">Yun Song</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+X">Xuanjing Huang</a>, <a href="/search/cs?searchtype=author&query=Wei%2C+Z">Zhongyu Wei</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.06882v1-abstract-short" style="display: inline;"> Large Language Models (LLMs) have significantly advanced legal intelligence, but the scarcity of scenario data impedes the progress toward interactive legal scenarios. This paper introduces a Multi-agent Legal Simulation Driver (MASER) to scalably generate synthetic data by simulating interactive legal scenarios. Leveraging real-legal case sources, MASER ensures the consistency of legal attributes… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.06882v1-abstract-full').style.display = 'inline'; document.getElementById('2502.06882v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.06882v1-abstract-full" style="display: none;"> Large Language Models (LLMs) have significantly advanced legal intelligence, but the scarcity of scenario data impedes the progress toward interactive legal scenarios. This paper introduces a Multi-agent Legal Simulation Driver (MASER) to scalably generate synthetic data by simulating interactive legal scenarios. Leveraging real-legal case sources, MASER ensures the consistency of legal attributes between participants and introduces a supervisory mechanism to align participants' characters and behaviors as well as addressing distractions. A Multi-stage Interactive Legal Evaluation (MILE) benchmark is further constructed to evaluate LLMs' performance in dynamic legal scenarios. Extensive experiments confirm the effectiveness of our framework. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.06882v1-abstract-full').style.display = 'none'; document.getElementById('2502.06882v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by NAACL 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.05870">arXiv:2502.05870</a> <span> [<a href="https://arxiv.org/pdf/2502.05870">pdf</a>, <a href="https://arxiv.org/format/2502.05870">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> </div> </div> <p class="title is-5 mathjax"> Understanding Design Fixation in Generative AI </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+L">Liuqing Chen</a>, <a href="/search/cs?searchtype=author&query=Song%2C+Y">Yaxuan Song</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+C">Chunyuan Zheng</a>, <a href="/search/cs?searchtype=author&query=Jing%2C+Q">Qianzhi Jing</a>, <a href="/search/cs?searchtype=author&query=Hansen%2C+P">Preben Hansen</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+L">Lingyun Sun</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.05870v1-abstract-short" style="display: inline;"> Generative AI (GenAI) provides new opportunities for creativity support, but the phenomenon of GenAI design fixation remains underexplored. While human design fixation typically constrains ideas to familiar or existing solutions, our findings reveal that GenAI similarly experience design fixation, limiting its ability to generate novel and diverse design outcomes. To advance understanding of GenAI… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.05870v1-abstract-full').style.display = 'inline'; document.getElementById('2502.05870v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.05870v1-abstract-full" style="display: none;"> Generative AI (GenAI) provides new opportunities for creativity support, but the phenomenon of GenAI design fixation remains underexplored. While human design fixation typically constrains ideas to familiar or existing solutions, our findings reveal that GenAI similarly experience design fixation, limiting its ability to generate novel and diverse design outcomes. To advance understanding of GenAI design fixation, we propose a theoretical framework includes the definition, causes, manifestations, and impacts of GenAI design fixation for creative design. We also conducted an experimental study to investigate the characteristics of GenAI design fixation in practice. We summarize how GenAI design fixation manifests in text generation model and image generation model respectively. Furthermore, we propose methods for mitigating GenAI design fixation for future creativity support tool design. We recommend adopting the lens of GenAI design fixation for creativity-oriented HCI research, as the unique perspectives and insights it provides. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.05870v1-abstract-full').style.display = 'none'; document.getElementById('2502.05870v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.02810">arXiv:2502.02810</a> <span> [<a href="https://arxiv.org/pdf/2502.02810">pdf</a>, <a href="https://arxiv.org/format/2502.02810">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Chemical Physics">physics.chem-ph</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Biomolecules">q-bio.BM</span> </div> </div> <p class="title is-5 mathjax"> Mol-LLM: Generalist Molecular LLM with Improved Graph Utilization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lee%2C+C">Chanhui Lee</a>, <a href="/search/cs?searchtype=author&query=Song%2C+Y">Yuheon Song</a>, <a href="/search/cs?searchtype=author&query=Jeong%2C+Y">YongJun Jeong</a>, <a href="/search/cs?searchtype=author&query=Ko%2C+H">Hanbum Ko</a>, <a href="/search/cs?searchtype=author&query=Hormazabal%2C+R">Rodrigo Hormazabal</a>, <a href="/search/cs?searchtype=author&query=Han%2C+S">Sehui Han</a>, <a href="/search/cs?searchtype=author&query=Bae%2C+K">Kyunghoon Bae</a>, <a href="/search/cs?searchtype=author&query=Lim%2C+S">Sungbin Lim</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+S">Sungwoong Kim</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.02810v1-abstract-short" style="display: inline;"> Recent advances in Large Language Models (LLMs) have motivated the development of general LLMs for molecular tasks. While several studies have demonstrated that fine-tuned LLMs can achieve impressive benchmark performances, they are far from genuine generalist molecular LLMs due to a lack of fundamental understanding of molecular structure. Specifically, when given molecular task instructions, LLM… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.02810v1-abstract-full').style.display = 'inline'; document.getElementById('2502.02810v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.02810v1-abstract-full" style="display: none;"> Recent advances in Large Language Models (LLMs) have motivated the development of general LLMs for molecular tasks. While several studies have demonstrated that fine-tuned LLMs can achieve impressive benchmark performances, they are far from genuine generalist molecular LLMs due to a lack of fundamental understanding of molecular structure. Specifically, when given molecular task instructions, LLMs trained with naive next-token prediction training assign similar likelihood scores to both original and negatively corrupted molecules, revealing their lack of molecular structure understanding that is crucial for reliable and general molecular LLMs. To overcome this limitation and obtain a true generalist molecular LLM, we introduce a novel multi-modal training method based on a thorough multi-modal instruction tuning as well as a molecular structure preference optimization between chosen and rejected graphs. On various molecular benchmarks, the proposed generalist molecular LLM, called Mol-LLM, achieves state-of-the-art performances among generalist LLMs on most tasks, at the same time, surpassing or comparable to state-of-the-art specialist LLMs. Moreover, Mol-LLM also shows superior generalization performances in reaction prediction tasks, demonstrating the effect of the molecular structure understanding for generalization perspective. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.02810v1-abstract-full').style.display = 'none'; document.getElementById('2502.02810v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.02016">arXiv:2502.02016</a> <span> [<a href="https://arxiv.org/pdf/2502.02016">pdf</a>, <a href="https://arxiv.org/format/2502.02016">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> A Periodic Bayesian Flow for Material Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wu%2C+H">Hanlin Wu</a>, <a href="/search/cs?searchtype=author&query=Song%2C+Y">Yuxuan Song</a>, <a href="/search/cs?searchtype=author&query=Gong%2C+J">Jingjing Gong</a>, <a href="/search/cs?searchtype=author&query=Cao%2C+Z">Ziyao Cao</a>, <a href="/search/cs?searchtype=author&query=Ouyang%2C+Y">Yawen Ouyang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jianbing Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+H">Hao Zhou</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+W">Wei-Ying Ma</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+J">Jingjing Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.02016v1-abstract-short" style="display: inline;"> Generative modeling of crystal data distribution is an important yet challenging task due to the unique periodic physical symmetry of crystals. Diffusion-based methods have shown early promise in modeling crystal distribution. More recently, Bayesian Flow Networks were introduced to aggregate noisy latent variables, resulting in a variance-reduced parameter space that has been shown to be advantag… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.02016v1-abstract-full').style.display = 'inline'; document.getElementById('2502.02016v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.02016v1-abstract-full" style="display: none;"> Generative modeling of crystal data distribution is an important yet challenging task due to the unique periodic physical symmetry of crystals. Diffusion-based methods have shown early promise in modeling crystal distribution. More recently, Bayesian Flow Networks were introduced to aggregate noisy latent variables, resulting in a variance-reduced parameter space that has been shown to be advantageous for modeling Euclidean data distributions with structural constraints (Song et al., 2023). Inspired by this, we seek to unlock its potential for modeling variables located in non-Euclidean manifolds e.g. those within crystal structures, by overcoming challenging theoretical issues. We introduce CrysBFN, a novel crystal generation method by proposing a periodic Bayesian flow, which essentially differs from the original Gaussian-based BFN by exhibiting non-monotonic entropy dynamics. To successfully realize the concept of periodic Bayesian flow, CrysBFN integrates a new entropy conditioning mechanism and empirically demonstrates its significance compared to time-conditioning. Extensive experiments over both crystal ab initio generation and crystal structure prediction tasks demonstrate the superiority of CrysBFN, which consistently achieves new state-of-the-art on all benchmarks. Surprisingly, we found that CrysBFN enjoys a significant improvement in sampling efficiency, e.g., ~100x speedup 10 v.s. 2000 steps network forwards) compared with previous diffusion-based methods on MP-20 dataset. Code is available at https://github.com/wu-han-lin/CrysBFN. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.02016v1-abstract-full').style.display = 'none'; document.getElementById('2502.02016v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to ICLR25</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.01627">arXiv:2502.01627</a> <span> [<a href="https://arxiv.org/pdf/2502.01627">pdf</a>, <a href="https://arxiv.org/format/2502.01627">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Instrumentation and Methods for Astrophysics">astro-ph.IM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="High Energy Astrophysical Phenomena">astro-ph.HE</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Applications">stat.AP</span> </div> </div> <p class="title is-5 mathjax"> A Poisson Process AutoDecoder for X-ray Sources </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Song%2C+Y">Yanke Song</a>, <a href="/search/cs?searchtype=author&query=Villar%2C+V+A">Victoria Ashley Villar</a>, <a href="/search/cs?searchtype=author&query=Martinez-Galarza%2C+J+R">Juan Rafael Martinez-Galarza</a>, <a href="/search/cs?searchtype=author&query=Dillmann%2C+S">Steven Dillmann</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.01627v2-abstract-short" style="display: inline;"> X-ray observing facilities, such as the Chandra X-ray Observatory and the eROSITA, have detected millions of astronomical sources associated with high-energy phenomena. The arrival of photons as a function of time follows a Poisson process and can vary by orders-of-magnitude, presenting obstacles for common tasks such as source classification, physical property derivation, and anomaly detection. P… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.01627v2-abstract-full').style.display = 'inline'; document.getElementById('2502.01627v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.01627v2-abstract-full" style="display: none;"> X-ray observing facilities, such as the Chandra X-ray Observatory and the eROSITA, have detected millions of astronomical sources associated with high-energy phenomena. The arrival of photons as a function of time follows a Poisson process and can vary by orders-of-magnitude, presenting obstacles for common tasks such as source classification, physical property derivation, and anomaly detection. Previous work has either failed to directly capture the Poisson nature of the data or only focuses on Poisson rate function reconstruction. In this work, we present Poisson Process AutoDecoder (PPAD). PPAD is a neural field decoder that maps fixed-length latent features to continuous Poisson rate functions across energy band and time via unsupervised learning. PPAD reconstructs the rate function and yields a representation at the same time. We demonstrate the efficacy of PPAD via reconstruction, regression, classification and anomaly detection experiments using the Chandra Source Catalog. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.01627v2-abstract-full').style.display = 'none'; document.getElementById('2502.01627v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 3 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">13 pages, 5 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.01572">arXiv:2502.01572</a> <span> [<a href="https://arxiv.org/pdf/2502.01572">pdf</a>, <a href="https://arxiv.org/format/2502.01572">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> MakeAnything: Harnessing Diffusion Transformers for Multi-Domain Procedural Sequence Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Song%2C+Y">Yiren Song</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+C">Cheng Liu</a>, <a href="/search/cs?searchtype=author&query=Shou%2C+M+Z">Mike Zheng Shou</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.01572v2-abstract-short" style="display: inline;"> A hallmark of human intelligence is the ability to create complex artifacts through structured multi-step processes. Generating procedural tutorials with AI is a longstanding but challenging goal, facing three key obstacles: (1) scarcity of multi-task procedural datasets, (2) maintaining logical continuity and visual consistency between steps, and (3) generalizing across multiple domains. To addre… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.01572v2-abstract-full').style.display = 'inline'; document.getElementById('2502.01572v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.01572v2-abstract-full" style="display: none;"> A hallmark of human intelligence is the ability to create complex artifacts through structured multi-step processes. Generating procedural tutorials with AI is a longstanding but challenging goal, facing three key obstacles: (1) scarcity of multi-task procedural datasets, (2) maintaining logical continuity and visual consistency between steps, and (3) generalizing across multiple domains. To address these challenges, we propose a multi-domain dataset covering 21 tasks with over 24,000 procedural sequences. Building upon this foundation, we introduce MakeAnything, a framework based on the diffusion transformer (DIT), which leverages fine-tuning to activate the in-context capabilities of DIT for generating consistent procedural sequences. We introduce asymmetric low-rank adaptation (LoRA) for image generation, which balances generalization capabilities and task-specific performance by freezing encoder parameters while adaptively tuning decoder layers. Additionally, our ReCraft model enables image-to-process generation through spatiotemporal consistency constraints, allowing static images to be decomposed into plausible creation sequences. Extensive experiments demonstrate that MakeAnything surpasses existing methods, setting new performance benchmarks for procedural generation tasks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.01572v2-abstract-full').style.display = 'none'; document.getElementById('2502.01572v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 3 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.01105">arXiv:2502.01105</a> <span> [<a href="https://arxiv.org/pdf/2502.01105">pdf</a>, <a href="https://arxiv.org/format/2502.01105">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> LayerTracer: Cognitive-Aligned Layered SVG Synthesis via Diffusion Transformer </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Song%2C+Y">Yiren Song</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+D">Danze Chen</a>, <a href="/search/cs?searchtype=author&query=Shou%2C+M+Z">Mike Zheng Shou</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.01105v1-abstract-short" style="display: inline;"> Generating cognitive-aligned layered SVGs remains challenging due to existing methods' tendencies toward either oversimplified single-layer outputs or optimization-induced shape redundancies. We propose LayerTracer, a diffusion transformer based framework that bridges this gap by learning designers' layered SVG creation processes from a novel dataset of sequential design operations. Our approach o… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.01105v1-abstract-full').style.display = 'inline'; document.getElementById('2502.01105v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.01105v1-abstract-full" style="display: none;"> Generating cognitive-aligned layered SVGs remains challenging due to existing methods' tendencies toward either oversimplified single-layer outputs or optimization-induced shape redundancies. We propose LayerTracer, a diffusion transformer based framework that bridges this gap by learning designers' layered SVG creation processes from a novel dataset of sequential design operations. Our approach operates in two phases: First, a text-conditioned DiT generates multi-phase rasterized construction blueprints that simulate human design workflows. Second, layer-wise vectorization with path deduplication produces clean, editable SVGs. For image vectorization, we introduce a conditional diffusion mechanism that encodes reference images into latent tokens, guiding hierarchical reconstruction while preserving structural integrity. Extensive experiments demonstrate LayerTracer's superior performance against optimization-based and neural baselines in both generation quality and editability, effectively aligning AI-generated vectors with professional design cognition. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.01105v1-abstract-full').style.display = 'none'; document.getElementById('2502.01105v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.00366">arXiv:2502.00366</a> <span> [<a href="https://arxiv.org/pdf/2502.00366">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Prostate-Specific Foundation Models for Enhanced Detection of Clinically Significant Cancer </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lee%2C+J+H">Jeong Hoon Lee</a>, <a href="/search/cs?searchtype=author&query=Li%2C+C+X">Cynthia Xinran Li</a>, <a href="/search/cs?searchtype=author&query=Jahanandish%2C+H">Hassan Jahanandish</a>, <a href="/search/cs?searchtype=author&query=Bhattacharya%2C+I">Indrani Bhattacharya</a>, <a href="/search/cs?searchtype=author&query=Vesal%2C+S">Sulaiman Vesal</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+L">Lichun Zhang</a>, <a href="/search/cs?searchtype=author&query=Sang%2C+S">Shengtian Sang</a>, <a href="/search/cs?searchtype=author&query=Choi%2C+M+H">Moon Hyung Choi</a>, <a href="/search/cs?searchtype=author&query=Soerensen%2C+S+J+C">Simon John Christoph Soerensen</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+S+R">Steve Ran Zhou</a>, <a href="/search/cs?searchtype=author&query=Sommer%2C+E+R">Elijah Richard Sommer</a>, <a href="/search/cs?searchtype=author&query=Fan%2C+R">Richard Fan</a>, <a href="/search/cs?searchtype=author&query=Ghanouni%2C+P">Pejman Ghanouni</a>, <a href="/search/cs?searchtype=author&query=Song%2C+Y">Yuze Song</a>, <a href="/search/cs?searchtype=author&query=Seibert%2C+T+M">Tyler M. Seibert</a>, <a href="/search/cs?searchtype=author&query=Sonn%2C+G+A">Geoffrey A. Sonn</a>, <a href="/search/cs?searchtype=author&query=Rusu%2C+M">Mirabela Rusu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.00366v2-abstract-short" style="display: inline;"> Accurate prostate cancer diagnosis remains challenging. Even when using MRI, radiologists exhibit low specificity and significant inter-observer variability, leading to potential delays or inaccuracies in identifying clinically significant cancers. This leads to numerous unnecessary biopsies and risks of missing clinically significant cancers. Here we present prostate vision contrastive network (P… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.00366v2-abstract-full').style.display = 'inline'; document.getElementById('2502.00366v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.00366v2-abstract-full" style="display: none;"> Accurate prostate cancer diagnosis remains challenging. Even when using MRI, radiologists exhibit low specificity and significant inter-observer variability, leading to potential delays or inaccuracies in identifying clinically significant cancers. This leads to numerous unnecessary biopsies and risks of missing clinically significant cancers. Here we present prostate vision contrastive network (ProViCNet), prostate organ-specific vision foundation models for Magnetic Resonance Imaging (MRI) and Trans-Rectal Ultrasound imaging (TRUS) for comprehensive cancer detection. ProViCNet was trained and validated using 4,401 patients across six institutions, as a prostate cancer detection model on radiology images relying on patch-level contrastive learning guided by biopsy confirmed radiologist annotations. ProViCNet demonstrated consistent performance across multiple internal and external validation cohorts with area under the receiver operating curve values ranging from 0.875 to 0.966, significantly outperforming radiologists in the reader study (0.907 versus 0.805, p<0.001) for mpMRI, while achieving 0.670 to 0.740 for TRUS. We also integrated ProViCNet with standard PSA to develop a virtual screening test, and we showed that we can maintain the high sensitivity for detecting clinically significant cancers while more than doubling specificity from 15% to 38% (p<0.001), thereby substantially reducing unnecessary biopsies. These findings highlight that ProViCNet's potential for enhancing prostate cancer diagnosis accuracy and reduce unnecessary biopsies, thereby optimizing diagnostic pathways. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.00366v2-abstract-full').style.display = 'none'; document.getElementById('2502.00366v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 1 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">44pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.00283">arXiv:2502.00283</a> <span> [<a href="https://arxiv.org/pdf/2502.00283">pdf</a>, <a href="https://arxiv.org/format/2502.00283">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> </div> </div> <p class="title is-5 mathjax"> How Generative AI supports human in conceptual design </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+L">Liuging Chen</a>, <a href="/search/cs?searchtype=author&query=Song%2C+Y">Yaxuan Song</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+J">Jia Guo</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+L">Lingyun Sun</a>, <a href="/search/cs?searchtype=author&query=Childs%2C+P">Peter Childs</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+Y">Yuan Yin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.00283v1-abstract-short" style="display: inline;"> Generative Artificial Intelligence (Generative AI) is a collection of AI technologies that can generate new information such as texts and images. With its strong capabilities, Generative AI has been actively studied in creative design processes. However, limited studies have explored the roles of humans and Generative AI in conceptual design processes, leaving a gap for human-AI collaboration inve… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.00283v1-abstract-full').style.display = 'inline'; document.getElementById('2502.00283v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.00283v1-abstract-full" style="display: none;"> Generative Artificial Intelligence (Generative AI) is a collection of AI technologies that can generate new information such as texts and images. With its strong capabilities, Generative AI has been actively studied in creative design processes. However, limited studies have explored the roles of humans and Generative AI in conceptual design processes, leaving a gap for human-AI collaboration investigation. To address this gap, this study uncovers the contributions of different Generative AI technologies in assisting humans in the conceptual design process. Novice designers completed two design tasks with or without the assistance of Generative AI. Results revealed that Generative AI primarily assists humans in problem definition and idea generation stages, while idea selection and evaluation remain predominantly human-led. Additionally, with Generative AI assistance, the idea selection and evaluation stages were further enhanced. Based on the findings, we discuss the role of Generative AI in human-AI collaboration and implications for enhancing future conceptual design support with Generative AI assistance. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.00283v1-abstract-full').style.display = 'none'; document.getElementById('2502.00283v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">20 pages, 2 figures, accepted by Design Science</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.19034">arXiv:2501.19034</a> <span> [<a href="https://arxiv.org/pdf/2501.19034">pdf</a>, <a href="https://arxiv.org/format/2501.19034">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> XRF V2: A Dataset for Action Summarization with Wi-Fi Signals, and IMUs in Phones, Watches, Earbuds, and Glasses </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lan%2C+B">Bo Lan</a>, <a href="/search/cs?searchtype=author&query=Li%2C+P">Pei Li</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+J">Jiaxi Yin</a>, <a href="/search/cs?searchtype=author&query=Song%2C+Y">Yunpeng Song</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+G">Ge Wang</a>, <a href="/search/cs?searchtype=author&query=Ding%2C+H">Han Ding</a>, <a href="/search/cs?searchtype=author&query=Han%2C+J">Jinsong Han</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+F">Fei Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.19034v1-abstract-short" style="display: inline;"> Human Action Recognition (HAR) plays a crucial role in applications such as health monitoring, smart home automation, and human-computer interaction. While HAR has been extensively studied, action summarization, which involves identifying and summarizing continuous actions, remains an emerging task. This paper introduces the novel XRF V2 dataset, designed for indoor daily activity Temporal Action… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.19034v1-abstract-full').style.display = 'inline'; document.getElementById('2501.19034v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.19034v1-abstract-full" style="display: none;"> Human Action Recognition (HAR) plays a crucial role in applications such as health monitoring, smart home automation, and human-computer interaction. While HAR has been extensively studied, action summarization, which involves identifying and summarizing continuous actions, remains an emerging task. This paper introduces the novel XRF V2 dataset, designed for indoor daily activity Temporal Action Localization (TAL) and action summarization. XRF V2 integrates multimodal data from Wi-Fi signals, IMU sensors (smartphones, smartwatches, headphones, and smart glasses), and synchronized video recordings, offering a diverse collection of indoor activities from 16 volunteers across three distinct environments. To tackle TAL and action summarization, we propose the XRFMamba neural network, which excels at capturing long-term dependencies in untrimmed sensory sequences and outperforms state-of-the-art methods, such as ActionFormer and WiFiTAD. We envision XRF V2 as a valuable resource for advancing research in human action localization, action forecasting, pose estimation, multimodal foundation models pre-training, synthetic data generation, and more. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.19034v1-abstract-full').style.display = 'none'; document.getElementById('2501.19034v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">27 pages, 11 figures, 8 tables</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.18170">arXiv:2501.18170</a> <span> [<a href="https://arxiv.org/pdf/2501.18170">pdf</a>, <a href="https://arxiv.org/format/2501.18170">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Continually Evolved Multimodal Foundation Models for Cancer Prognosis </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Peng%2C+J">Jie Peng</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+S">Shuang Zhou</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+L">Longwei Yang</a>, <a href="/search/cs?searchtype=author&query=Song%2C+Y">Yiran Song</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+M">Mohan Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+K">Kaixiong Zhou</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+F">Feng Xie</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+M">Mingquan Lin</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+R">Rui Zhang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+T">Tianlong Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.18170v2-abstract-short" style="display: inline;"> Cancer prognosis is a critical task that involves predicting patient outcomes and survival rates. To enhance prediction accuracy, previous studies have integrated diverse data modalities, such as clinical notes, medical images, and genomic data, leveraging their complementary information. However, existing approaches face two major limitations. First, they struggle to incorporate newly arrived dat… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.18170v2-abstract-full').style.display = 'inline'; document.getElementById('2501.18170v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.18170v2-abstract-full" style="display: none;"> Cancer prognosis is a critical task that involves predicting patient outcomes and survival rates. To enhance prediction accuracy, previous studies have integrated diverse data modalities, such as clinical notes, medical images, and genomic data, leveraging their complementary information. However, existing approaches face two major limitations. First, they struggle to incorporate newly arrived data with varying distributions into training, such as patient records from different hospitals, thus rendering sub-optimal generalizability and limited utility in real-world applications. Second, most multimodal integration methods rely on simplistic concatenation or task-specific pipelines, which fail to capture the complex interdependencies across modalities. To address these, we propose a continually evolving multi-modal foundation model. Extensive experiments on the TCGA dataset demonstrate the effectiveness of our approach, highlighting its potential to advance cancer prognosis by enabling robust and adaptive multimodal integration. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.18170v2-abstract-full').style.display = 'none'; document.getElementById('2501.18170v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 30 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">9 pages, 1 figure</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">MSC Class:</span> I.2.7; J.3 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.18102">arXiv:2501.18102</a> <span> [<a href="https://arxiv.org/pdf/2501.18102">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> </div> </div> <p class="title is-5 mathjax"> Security for IEEE P1451.1.6-based Sensor Networks for IoT Applications </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Nishi%2C+H">Hiroaki Nishi</a>, <a href="/search/cs?searchtype=author&query=Wijekoon%2C+J">Janaka Wijekoon</a>, <a href="/search/cs?searchtype=author&query=Song%2C+E+Y">Eugene Y. Song</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+K+B">Kang B. Lee</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.18102v1-abstract-short" style="display: inline;"> There are many challenges for Internet of Things (IoT) sensor networks including the lack of robust standards, diverse wireline and wireless connectivity, interoperability, security, and privacy. Addressing these challenges, the Institute of Electrical and Electronics Engineers (IEEE) P1451.0 standard defines network services, transducer services, transducer electronic data sheets (TEDS) format, a… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.18102v1-abstract-full').style.display = 'inline'; document.getElementById('2501.18102v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.18102v1-abstract-full" style="display: none;"> There are many challenges for Internet of Things (IoT) sensor networks including the lack of robust standards, diverse wireline and wireless connectivity, interoperability, security, and privacy. Addressing these challenges, the Institute of Electrical and Electronics Engineers (IEEE) P1451.0 standard defines network services, transducer services, transducer electronic data sheets (TEDS) format, and a security framework to achieve sensor data security and interoperability for IoT applications. This paper proposes a security solution for IEEE P1451.1.6-based sensor networks for IoT applications utilizing the security framework defined in IEEE P1451.0. The proposed solution includes an architecture, a security policy with six security levels, security standards, and security TEDS. Further, this paper introduces a new service to update access control lists (ACLs) to regulate the access for topic names by the applications and provides an implementation of the security TEDS for IEEE P1451.1.6-based sensor networks. The paper also illustrates how to access security TEDS that contain metadata on security standards to achieve sensor data security and interoperability. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.18102v1-abstract-full').style.display = 'none'; document.getElementById('2501.18102v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.17851">arXiv:2501.17851</a> <span> [<a href="https://arxiv.org/pdf/2501.17851">pdf</a>, <a href="https://arxiv.org/format/2501.17851">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Software Engineering">cs.SE</span> </div> </div> <p class="title is-5 mathjax"> UGSim: Autonomous Buoyancy-Driven Underwater Glider Simulator with LQR Control Strategy and Recursive Guidance System </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xu%2C+Z">Zhizun Xu</a>, <a href="/search/cs?searchtype=author&query=Song%2C+Y">Yang Song</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+J">Jiabao Zhu</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+W">Weichao Shi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.17851v1-abstract-short" style="display: inline;"> This paper presents the UGSim, a simulator for buoyancy-driven gliders, with a LQR control strategy, and a recursive guidance system. Building on the top of the DAVE and the UUVsim, it is designed to address unique challenges that come from the complex hydrodynamic and hydrostatic impacts on buoyancy-driven gliders, which conventional robotics simulators can't deal with. Since distinguishing featu… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.17851v1-abstract-full').style.display = 'inline'; document.getElementById('2501.17851v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.17851v1-abstract-full" style="display: none;"> This paper presents the UGSim, a simulator for buoyancy-driven gliders, with a LQR control strategy, and a recursive guidance system. Building on the top of the DAVE and the UUVsim, it is designed to address unique challenges that come from the complex hydrodynamic and hydrostatic impacts on buoyancy-driven gliders, which conventional robotics simulators can't deal with. Since distinguishing features of the class of vehicles, general controllers and guidance systems developed for underwater robotics are infeasible. The simulator is provided to accelerate the development and the evaluation of algorithms that would otherwise require expensive and time-consuming operations at sea. It consists of a basic kinetic module, a LQR control module and a recursive guidance module, which allows the user to concentrate on the single problem rather than the whole robotics system and the software infrastructure. We demonstrate the usage of the simulator through an example, loading the configuration of the buoyancy-driven glider named Petrel-II, presenting its dynamics simulation, performances of the control strategy and the guidance system. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.17851v1-abstract-full').style.display = 'none'; document.getElementById('2501.17851v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.17799">arXiv:2501.17799</a> <span> [<a href="https://arxiv.org/pdf/2501.17799">pdf</a>, <a href="https://arxiv.org/format/2501.17799">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1145/3706598.3714213">10.1145/3706598.3714213 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Leveraging Multimodal LLM for Inspirational User Interface Search </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Park%2C+S">Seokhyeon Park</a>, <a href="/search/cs?searchtype=author&query=Song%2C+Y">Yumin Song</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+S">Soohyun Lee</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+J">Jaeyoung Kim</a>, <a href="/search/cs?searchtype=author&query=Seo%2C+J">Jinwook Seo</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.17799v3-abstract-short" style="display: inline;"> Inspirational search, the process of exploring designs to inform and inspire new creative work, is pivotal in mobile user interface (UI) design. However, exploring the vast space of UI references remains a challenge. Existing AI-based UI search methods often miss crucial semantics like target users or the mood of apps. Additionally, these models typically require metadata like view hierarchies, li… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.17799v3-abstract-full').style.display = 'inline'; document.getElementById('2501.17799v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.17799v3-abstract-full" style="display: none;"> Inspirational search, the process of exploring designs to inform and inspire new creative work, is pivotal in mobile user interface (UI) design. However, exploring the vast space of UI references remains a challenge. Existing AI-based UI search methods often miss crucial semantics like target users or the mood of apps. Additionally, these models typically require metadata like view hierarchies, limiting their practical use. We used a multimodal large language model (MLLM) to extract and interpret semantics from mobile UI images. We identified key UI semantics through a formative study and developed a semantic-based UI search system. Through computational and human evaluations, we demonstrate that our approach significantly outperforms existing UI retrieval methods, offering UI designers a more enriched and contextually relevant search experience. We enhance the understanding of mobile UI design semantics and highlight MLLMs' potential in inspirational search, providing a rich dataset of UI semantics for future studies. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.17799v3-abstract-full').style.display = 'none'; document.getElementById('2501.17799v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 29 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">In Proceedings of the SIGCHI Conference on Human Factors in Computing Systems (CHI '25)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.16022">arXiv:2501.16022</a> <span> [<a href="https://arxiv.org/pdf/2501.16022">pdf</a>, <a href="https://arxiv.org/format/2501.16022">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Freestyle Sketch-in-the-Loop Image Segmentation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Koley%2C+S">Subhadeep Koley</a>, <a href="/search/cs?searchtype=author&query=Gajjala%2C+V+R">Viswanatha Reddy Gajjala</a>, <a href="/search/cs?searchtype=author&query=Sain%2C+A">Aneeshan Sain</a>, <a href="/search/cs?searchtype=author&query=Chowdhury%2C+P+N">Pinaki Nath Chowdhury</a>, <a href="/search/cs?searchtype=author&query=Xiang%2C+T">Tao Xiang</a>, <a href="/search/cs?searchtype=author&query=Bhunia%2C+A+K">Ayan Kumar Bhunia</a>, <a href="/search/cs?searchtype=author&query=Song%2C+Y">Yi-Zhe Song</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.16022v1-abstract-short" style="display: inline;"> In this paper, we expand the domain of sketch research into the field of image segmentation, aiming to establish freehand sketches as a query modality for subjective image segmentation. Our innovative approach introduces a "sketch-in-the-loop" image segmentation framework, enabling the segmentation of visual concepts partially, completely, or in groupings - a truly "freestyle" approach - without t… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.16022v1-abstract-full').style.display = 'inline'; document.getElementById('2501.16022v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.16022v1-abstract-full" style="display: none;"> In this paper, we expand the domain of sketch research into the field of image segmentation, aiming to establish freehand sketches as a query modality for subjective image segmentation. Our innovative approach introduces a "sketch-in-the-loop" image segmentation framework, enabling the segmentation of visual concepts partially, completely, or in groupings - a truly "freestyle" approach - without the need for a purpose-made dataset (i.e., mask-free). This framework capitalises on the synergy between sketch-based image retrieval (SBIR) models and large-scale pre-trained models (CLIP or DINOv2). The former provides an effective training signal, while fine-tuned versions of the latter execute the subjective segmentation. Additionally, our purpose-made augmentation strategy enhances the versatility of our sketch-guided mask generation, allowing segmentation at multiple granularity levels. Extensive evaluations across diverse benchmark datasets underscore the superior performance of our method in comparison to existing approaches across various evaluation scenarios. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.16022v1-abstract-full').style.display = 'none'; document.getElementById('2501.16022v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.15891">arXiv:2501.15891</a> <span> [<a href="https://arxiv.org/pdf/2501.15891">pdf</a>, <a href="https://arxiv.org/format/2501.15891">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Any2AnyTryon: Leveraging Adaptive Position Embeddings for Versatile Virtual Clothing Tasks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Guo%2C+H">Hailong Guo</a>, <a href="/search/cs?searchtype=author&query=Zeng%2C+B">Bohan Zeng</a>, <a href="/search/cs?searchtype=author&query=Song%2C+Y">Yiren Song</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+W">Wentao Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+C">Chuang Zhang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+J">Jiaming Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.15891v1-abstract-short" style="display: inline;"> Image-based virtual try-on (VTON) aims to generate a virtual try-on result by transferring an input garment onto a target person's image. However, the scarcity of paired garment-model data makes it challenging for existing methods to achieve high generalization and quality in VTON. Also, it limits the ability to generate mask-free try-ons. To tackle the data scarcity problem, approaches such as St… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.15891v1-abstract-full').style.display = 'inline'; document.getElementById('2501.15891v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.15891v1-abstract-full" style="display: none;"> Image-based virtual try-on (VTON) aims to generate a virtual try-on result by transferring an input garment onto a target person's image. However, the scarcity of paired garment-model data makes it challenging for existing methods to achieve high generalization and quality in VTON. Also, it limits the ability to generate mask-free try-ons. To tackle the data scarcity problem, approaches such as Stable Garment and MMTryon use a synthetic data strategy, effectively increasing the amount of paired data on the model side. However, existing methods are typically limited to performing specific try-on tasks and lack user-friendliness. To enhance the generalization and controllability of VTON generation, we propose Any2AnyTryon, which can generate try-on results based on different textual instructions and model garment images to meet various needs, eliminating the reliance on masks, poses, or other conditions. Specifically, we first construct the virtual try-on dataset LAION-Garment, the largest known open-source garment try-on dataset. Then, we introduce adaptive position embedding, which enables the model to generate satisfactory outfitted model images or garment images based on input images of different sizes and categories, significantly enhancing the generalization and controllability of VTON generation. In our experiments, we demonstrate the effectiveness of our Any2AnyTryon and compare it with existing methods. The results show that Any2AnyTryon enables flexible, controllable, and high-quality image-based virtual try-on generation.https://logn-2024.github.io/Any2anyTryonProjectPage/ <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.15891v1-abstract-full').style.display = 'none'; document.getElementById('2501.15891v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">13 pages,13 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.14224">arXiv:2501.14224</a> <span> [<a href="https://arxiv.org/pdf/2501.14224">pdf</a>, <a href="https://arxiv.org/format/2501.14224">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Databases">cs.DB</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Top Ten Challenges Towards Agentic Neural Graph Databases </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Bai%2C+J">Jiaxin Bai</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zihao Wang</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+Y">Yukun Zhou</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+H">Hang Yin</a>, <a href="/search/cs?searchtype=author&query=Fei%2C+W">Weizhi Fei</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+Q">Qi Hu</a>, <a href="/search/cs?searchtype=author&query=Deng%2C+Z">Zheye Deng</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+J">Jiayang Cheng</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+T">Tianshi Zheng</a>, <a href="/search/cs?searchtype=author&query=Tsang%2C+H+T">Hong Ting Tsang</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+Y">Yisen Gao</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+Z">Zhongwei Xie</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yufei Li</a>, <a href="/search/cs?searchtype=author&query=Fan%2C+L">Lixin Fan</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+B">Binhang Yuan</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+W">Wei Wang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+L">Lei Chen</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+X">Xiaofang Zhou</a>, <a href="/search/cs?searchtype=author&query=Song%2C+Y">Yangqiu Song</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.14224v1-abstract-short" style="display: inline;"> Graph databases (GDBs) like Neo4j and TigerGraph excel at handling interconnected data but lack advanced inference capabilities. Neural Graph Databases (NGDBs) address this by integrating Graph Neural Networks (GNNs) for predictive analysis and reasoning over incomplete or noisy data. However, NGDBs rely on predefined queries and lack autonomy and adaptability. This paper introduces Agentic Neural… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.14224v1-abstract-full').style.display = 'inline'; document.getElementById('2501.14224v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.14224v1-abstract-full" style="display: none;"> Graph databases (GDBs) like Neo4j and TigerGraph excel at handling interconnected data but lack advanced inference capabilities. Neural Graph Databases (NGDBs) address this by integrating Graph Neural Networks (GNNs) for predictive analysis and reasoning over incomplete or noisy data. However, NGDBs rely on predefined queries and lack autonomy and adaptability. This paper introduces Agentic Neural Graph Databases (Agentic NGDBs), which extend NGDBs with three core functionalities: autonomous query construction, neural query execution, and continuous learning. We identify ten key challenges in realizing Agentic NGDBs: semantic unit representation, abductive reasoning, scalable query execution, and integration with foundation models like large language models (LLMs). By addressing these challenges, Agentic NGDBs can enable intelligent, self-improving systems for modern data-driven applications, paving the way for adaptable and autonomous data management solutions. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.14224v1-abstract-full').style.display = 'none'; document.getElementById('2501.14224v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">12 Pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.13985">arXiv:2501.13985</a> <span> [<a href="https://arxiv.org/pdf/2501.13985">pdf</a>, <a href="https://arxiv.org/format/2501.13985">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Pilot: Building the Federated Multimodal Instruction Tuning Framework </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xiong%2C+B">Baochen Xiong</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+X">Xiaoshan Yang</a>, <a href="/search/cs?searchtype=author&query=Song%2C+Y">Yaguang Song</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yaowei Wang</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+C">Changsheng Xu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.13985v1-abstract-short" style="display: inline;"> In this paper, we explore a novel federated multimodal instruction tuning task(FedMIT), which is significant for collaboratively fine-tuning MLLMs on different types of multimodal instruction data on distributed devices. To solve the new task, we propose a federated multimodal instruction tuning framework(Pilot). Our framework integrates two stages of "adapter on adapter" into the connector of the… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.13985v1-abstract-full').style.display = 'inline'; document.getElementById('2501.13985v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.13985v1-abstract-full" style="display: none;"> In this paper, we explore a novel federated multimodal instruction tuning task(FedMIT), which is significant for collaboratively fine-tuning MLLMs on different types of multimodal instruction data on distributed devices. To solve the new task, we propose a federated multimodal instruction tuning framework(Pilot). Our framework integrates two stages of "adapter on adapter" into the connector of the vision encoder and the LLM. In stage 1, we extract task-specific features and client-specific features from visual information. In stage 2, we build the cross-task Mixture-of-Adapters(CT-MoA) module to perform cross-task interaction. Each client can not only capture personalized information of local data and learn task-related multimodal information, but also learn general knowledge from other tasks. In addition, we introduce an adaptive parameter aggregation strategy for text training parameters, which optimizes parameter aggregation by calculating weights based on the euclidean distance between parameters, so that parameter aggregation can benefit from positive effects to the greatest extent while effectively reducing negative effects. Our framework can collaboratively exploit distributed data from different local clients to learn cross-task knowledge without being affected by the task heterogeneity during instruction tuning. The effectiveness of our method is verified in two different cross-task scenarios. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.13985v1-abstract-full').style.display = 'none'; document.getElementById('2501.13985v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.13805">arXiv:2501.13805</a> <span> [<a href="https://arxiv.org/pdf/2501.13805">pdf</a>, <a href="https://arxiv.org/format/2501.13805">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> EgoHand: Ego-centric Hand Pose Estimation and Gesture Recognition with Head-mounted Millimeter-wave Radar and IMUs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lv%2C+Y">Yizhe Lv</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+T">Tingting Zhang</a>, <a href="/search/cs?searchtype=author&query=Song%2C+Y">Yunpeng Song</a>, <a href="/search/cs?searchtype=author&query=Ding%2C+H">Han Ding</a>, <a href="/search/cs?searchtype=author&query=Han%2C+J">Jinsong Han</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+F">Fei Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.13805v1-abstract-short" style="display: inline;"> Recent advanced Virtual Reality (VR) headsets, such as the Apple Vision Pro, employ bottom-facing cameras to detect hand gestures and inputs, which offers users significant convenience in VR interactions. However, these bottom-facing cameras can sometimes be inconvenient and pose a risk of unintentionally exposing sensitive information, such as private body parts or personal surroundings. To mitig… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.13805v1-abstract-full').style.display = 'inline'; document.getElementById('2501.13805v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.13805v1-abstract-full" style="display: none;"> Recent advanced Virtual Reality (VR) headsets, such as the Apple Vision Pro, employ bottom-facing cameras to detect hand gestures and inputs, which offers users significant convenience in VR interactions. However, these bottom-facing cameras can sometimes be inconvenient and pose a risk of unintentionally exposing sensitive information, such as private body parts or personal surroundings. To mitigate these issues, we introduce EgoHand. This system provides an alternative solution by integrating millimeter-wave radar and IMUs for hand gesture recognition, thereby offering users an additional option for gesture interaction that enhances privacy protection. To accurately recognize hand gestures, we devise a two-stage skeleton-based gesture recognition scheme. In the first stage, a novel end-to-end Transformer architecture is employed to estimate the coordinates of hand joints. Subsequently, these estimated joint coordinates are utilized for gesture recognition. Extensive experiments involving 10 subjects show that EgoHand can detect hand gestures with 90.8% accuracy. Furthermore, EgoHand demonstrates robust performance across a variety of cross-domain tests, including different users, dominant hands, body postures, and scenes. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.13805v1-abstract-full').style.display = 'none'; document.getElementById('2501.13805v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">10 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.13475">arXiv:2501.13475</a> <span> [<a href="https://arxiv.org/pdf/2501.13475">pdf</a>, <a href="https://arxiv.org/format/2501.13475">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> LDR-Net: A Novel Framework for AI-generated Image Detection via Localized Discrepancy Representation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+J">JiaXin Chen</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+M">Miao Hu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+D">DengYong Zhang</a>, <a href="/search/cs?searchtype=author&query=Song%2C+Y">Yun Song</a>, <a href="/search/cs?searchtype=author&query=Liao%2C+X">Xin Liao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.13475v1-abstract-short" style="display: inline;"> With the rapid advancement of generative models, the visual quality of generated images has become nearly indistinguishable from the real ones, posing challenges to content authenticity verification. Existing methods for detecting AI-generated images primarily focus on specific forgery clues, which are often tailored to particular generative models like GANs or diffusion models. These approaches s… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.13475v1-abstract-full').style.display = 'inline'; document.getElementById('2501.13475v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.13475v1-abstract-full" style="display: none;"> With the rapid advancement of generative models, the visual quality of generated images has become nearly indistinguishable from the real ones, posing challenges to content authenticity verification. Existing methods for detecting AI-generated images primarily focus on specific forgery clues, which are often tailored to particular generative models like GANs or diffusion models. These approaches struggle to generalize across architectures. Building on the observation that generative images often exhibit local anomalies, such as excessive smoothness, blurred textures, and unnatural pixel variations in small regions, we propose the localized discrepancy representation network (LDR-Net), a novel approach for detecting AI-generated images. LDR-Net captures smoothing artifacts and texture irregularities, which are common but often overlooked. It integrates two complementary modules: local gradient autocorrelation (LGA) which models local smoothing anomalies to detect smoothing anomalies, and local variation pattern (LVP) which captures unnatural regularities by modeling the complexity of image patterns. By merging LGA and LVP features, a comprehensive representation of localized discrepancies can be provided. Extensive experiments demonstrate that our LDR-Net achieves state-of-the-art performance in detecting generated images and exhibits satisfactory generalization across unseen generative models. The code will be released upon acceptance of this paper. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.13475v1-abstract-full').style.display = 'none'; document.getElementById('2501.13475v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.13354">arXiv:2501.13354</a> <span> [<a href="https://arxiv.org/pdf/2501.13354">pdf</a>, <a href="https://arxiv.org/format/2501.13354">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> NUDT4MSTAR: A Large Dataset and Benchmark Towards Remote Sensing Object Recognition in the Wild </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yongxiang Liu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+W">Weijie Li</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+L">Li Liu</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+J">Jie Zhou</a>, <a href="/search/cs?searchtype=author&query=Xiong%2C+X">Xuying Xiong</a>, <a href="/search/cs?searchtype=author&query=Peng%2C+B">Bowen Peng</a>, <a href="/search/cs?searchtype=author&query=Song%2C+Y">Yafei Song</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+W">Wei Yang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+T">Tianpeng Liu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Z">Zhen Liu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+X">Xiang Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.13354v2-abstract-short" style="display: inline;"> As an indispensable sensor for Remote sensing, Synthetic Aperture Radar (SAR) has a unique capability for all-day imaging. Nevertheless, in a data-driven era, the scarcity of large-scale datasets poses a significant bottleneck to advancing SAR automatic target recognition (ATR) technology. This paper introduces NUDT4MSTAR, a large-scale SAR dataset for remote sensing target recognition in the wild… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.13354v2-abstract-full').style.display = 'inline'; document.getElementById('2501.13354v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.13354v2-abstract-full" style="display: none;"> As an indispensable sensor for Remote sensing, Synthetic Aperture Radar (SAR) has a unique capability for all-day imaging. Nevertheless, in a data-driven era, the scarcity of large-scale datasets poses a significant bottleneck to advancing SAR automatic target recognition (ATR) technology. This paper introduces NUDT4MSTAR, a large-scale SAR dataset for remote sensing target recognition in the wild, including 40 vehicle target types and various imaging conditions across 5 realistic scenes. NUDT4MSTAR represents a significant leap forward in dataset scale, containing over 190,000 images-tenfold the size of its predecessors. We meticulously annotate each image with detailed target information and imaging conditions. Besides, data in both processed magnitude images and original complex formats are provided. Then, we construct a comprehensive benchmark consisting of 7 experiments with 15 recognition methods focusing on the stable and effective ATR issues. Besides, we conduct transfer learning experiments utilizing various models training on NUDT4MSTAR and apply them to three other target datasets, demonstrating its substantial potential for the broader field of ground objects ATR. Finally, we discuss this dataset's application value and ATR's significant challenges. To the best of our knowledge, this work marks the first-ever endeavor to create a large-scale dataset benchmark for fine-grained SAR recognition in the wild, featuring an extensive collection of exhaustively annotated vehicle images. We expect that the open source of NUDT4MSTAR will facilitate the development of SAR ATR and attract a wider community of researchers. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.13354v2-abstract-full').style.display = 'none'; document.getElementById('2501.13354v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 22 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">18 pages, 14 figures; NUDT4MSTAR: https://github.com/waterdisappear/NUDT4MSTAR</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.12492">arXiv:2501.12492</a> <span> [<a href="https://arxiv.org/pdf/2501.12492">pdf</a>, <a href="https://arxiv.org/ps/2501.12492">ps</a>, <a href="https://arxiv.org/format/2501.12492">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Quantum Physics">quant-ph</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Emerging Technologies">cs.ET</span> </div> </div> <p class="title is-5 mathjax"> QuSplit: Achieving Both High Fidelity and Throughput via Job Splitting on Noisy Quantum Computers </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+J">Jinyang Li</a>, <a href="/search/cs?searchtype=author&query=Song%2C+Y">Yuhong Song</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yipei Liu</a>, <a href="/search/cs?searchtype=author&query=Pan%2C+J">Jianli Pan</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+L">Lei Yang</a>, <a href="/search/cs?searchtype=author&query=Humble%2C+T">Travis Humble</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+W">Weiwen Jiang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.12492v1-abstract-short" style="display: inline;"> As we enter the quantum utility era, the computing paradigm shifts toward quantum-centric computing, where multiple quantum processors collaborate with classical computers, exemplified by platforms like IBM Quantum and Amazon Braket. In this paradigm, efficient resource management is crucial; however, unlike classical computing, quantum processors face significant challenges due to noise, which ra… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.12492v1-abstract-full').style.display = 'inline'; document.getElementById('2501.12492v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.12492v1-abstract-full" style="display: none;"> As we enter the quantum utility era, the computing paradigm shifts toward quantum-centric computing, where multiple quantum processors collaborate with classical computers, exemplified by platforms like IBM Quantum and Amazon Braket. In this paradigm, efficient resource management is crucial; however, unlike classical computing, quantum processors face significant challenges due to noise, which raises fidelity concerns in quantum applications. Compounding this issue, the noise characteristics across different quantum processors are inherently heterogeneous, making resource optimization even more complex. Existing resource management strategies primarily focus on mapping and scheduling jobs to these heterogeneous backends, which leads to some jobs suffering extremely low fidelity. Targeting quantum optimization jobs (e.g., VQC, VQE, QAOA) - one of the most promising quantum applications in the NISQ era, we hypothesize that running the later stages of a job on a high-fidelity quantum processor can significantly enhance overall fidelity. To validate this hypothesis, we use the VQE as a case study and propose a novel and efficient Genetic Algorithm-based scheduling framework with the consideration of job splitting. Experimental results demonstrate that our approach maintains high fidelity across all jobs and significantly improves system throughput. Furthermore, the proposed algorithm shows excellent scalability with respect to the number of quantum processors and the volume of jobs, making it a robust solution for emerging quantum computing platforms. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.12492v1-abstract-full').style.display = 'none'; document.getElementById('2501.12492v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.09686">arXiv:2501.09686</a> <span> [<a href="https://arxiv.org/pdf/2501.09686">pdf</a>, <a href="https://arxiv.org/format/2501.09686">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Towards Large Reasoning Models: A Survey of Reinforced Reasoning with Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xu%2C+F">Fengli Xu</a>, <a href="/search/cs?searchtype=author&query=Hao%2C+Q">Qianyue Hao</a>, <a href="/search/cs?searchtype=author&query=Zong%2C+Z">Zefang Zong</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+J">Jingwei Wang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yunke Zhang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+J">Jingyi Wang</a>, <a href="/search/cs?searchtype=author&query=Lan%2C+X">Xiaochong Lan</a>, <a href="/search/cs?searchtype=author&query=Gong%2C+J">Jiahui Gong</a>, <a href="/search/cs?searchtype=author&query=Ouyang%2C+T">Tianjian Ouyang</a>, <a href="/search/cs?searchtype=author&query=Meng%2C+F">Fanjin Meng</a>, <a href="/search/cs?searchtype=author&query=Shao%2C+C">Chenyang Shao</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+Y">Yuwei Yan</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Q">Qinglong Yang</a>, <a href="/search/cs?searchtype=author&query=Song%2C+Y">Yiwen Song</a>, <a href="/search/cs?searchtype=author&query=Ren%2C+S">Sijian Ren</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+X">Xinyuan Hu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yu Li</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+J">Jie Feng</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+C">Chen Gao</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yong Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.09686v3-abstract-short" style="display: inline;"> Language has long been conceived as an essential tool for human reasoning. The breakthrough of Large Language Models (LLMs) has sparked significant research interest in leveraging these models to tackle complex reasoning tasks. Researchers have moved beyond simple autoregressive token generation by introducing the concept of "thought" -- a sequence of tokens representing intermediate steps in the… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.09686v3-abstract-full').style.display = 'inline'; document.getElementById('2501.09686v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.09686v3-abstract-full" style="display: none;"> Language has long been conceived as an essential tool for human reasoning. The breakthrough of Large Language Models (LLMs) has sparked significant research interest in leveraging these models to tackle complex reasoning tasks. Researchers have moved beyond simple autoregressive token generation by introducing the concept of "thought" -- a sequence of tokens representing intermediate steps in the reasoning process. This innovative paradigm enables LLMs' to mimic complex human reasoning processes, such as tree search and reflective thinking. Recently, an emerging trend of learning to reason has applied reinforcement learning (RL) to train LLMs to master reasoning processes. This approach enables the automatic generation of high-quality reasoning trajectories through trial-and-error search algorithms, significantly expanding LLMs' reasoning capacity by providing substantially more training data. Furthermore, recent studies demonstrate that encouraging LLMs to "think" with more tokens during test-time inference can further significantly boost reasoning accuracy. Therefore, the train-time and test-time scaling combined to show a new research frontier -- a path toward Large Reasoning Model. The introduction of OpenAI's o1 series marks a significant milestone in this research direction. In this survey, we present a comprehensive review of recent progress in LLM reasoning. We begin by introducing the foundational background of LLMs and then explore the key technical components driving the development of large reasoning models, with a focus on automated data construction, learning-to-reason techniques, and test-time scaling. We also analyze popular open-source projects at building large reasoning models, and conclude with open challenges and future research directions. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.09686v3-abstract-full').style.display = 'none'; document.getElementById('2501.09686v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 16 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">36 pages, 5 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.07834">arXiv:2501.07834</a> <span> [<a href="https://arxiv.org/pdf/2501.07834">pdf</a>, <a href="https://arxiv.org/format/2501.07834">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multiagent Systems">cs.MA</span> </div> </div> <p class="title is-5 mathjax"> Flow: Modularized Agentic Workflow Automation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Niu%2C+B">Boye Niu</a>, <a href="/search/cs?searchtype=author&query=Song%2C+Y">Yiliao Song</a>, <a href="/search/cs?searchtype=author&query=Lian%2C+K">Kai Lian</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+Y">Yifan Shen</a>, <a href="/search/cs?searchtype=author&query=Yao%2C+Y">Yu Yao</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+K">Kun Zhang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+T">Tongliang Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.07834v2-abstract-short" style="display: inline;"> Multi-agent frameworks powered by large language models (LLMs) have demonstrated great success in automated planning and task execution. However, the effective adjustment of agentic workflows during execution has not been well studied. An effective workflow adjustment is crucial in real-world scenarios, as the initial plan must adjust to unforeseen challenges and changing conditions in real time t… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.07834v2-abstract-full').style.display = 'inline'; document.getElementById('2501.07834v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.07834v2-abstract-full" style="display: none;"> Multi-agent frameworks powered by large language models (LLMs) have demonstrated great success in automated planning and task execution. However, the effective adjustment of agentic workflows during execution has not been well studied. An effective workflow adjustment is crucial in real-world scenarios, as the initial plan must adjust to unforeseen challenges and changing conditions in real time to ensure the efficient execution of complex tasks. In this paper, we define workflows as an activity-on-vertex (AOV) graph, which allows continuous workflow refinement by LLM agents through dynamic subtask allocation adjustment based on historical performance and previous AOVs. To further enhance framework performance, we emphasize modularity in workflow design based on evaluating parallelism and dependency complexity. With this design, our proposed multi-agent framework achieves efficient concurrent execution of subtasks, effective goal achievement, and enhanced error tolerance. Empirical results across various practical tasks demonstrate significant improvements in the efficiency of multi-agent frameworks through dynamic workflow refinement and modularization. The code is available at: https://github.com/tmllab/2025_ICLR_FLOW. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.07834v2-abstract-full').style.display = 'none'; document.getElementById('2501.07834v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 13 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> </ol> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Song%2C+Y&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Song%2C+Y&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Song%2C+Y&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Song%2C+Y&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Song%2C+Y&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Song%2C+Y&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">…</span></li> </ul> </nav> <div class="is-hidden-tablet">  <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>  </span> </div> </div> </main> <footer> <div class="columns is-desktop" role="navigation" aria-label="Secondary">  <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div>   <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div>  </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>

CINXE.COM

Search | arXiv e-print repository