Search | arXiv e-print repository
<a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1–50 of 1,042 results for author: <span class="mathjax">Lin, H</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a> </span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&query=Lin%2C+H">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." 
type="text" value="Lin, H"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Lin%2C+H&terms-0-field=author&size=50&order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Lin, H"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. 
</div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Lin%2C+H&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Lin%2C+H&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Lin%2C+H&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Lin%2C+H&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Lin%2C+H&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Lin%2C+H&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">…</span></li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.09838">arXiv:2502.09838</a> <span> [<a href="https://arxiv.org/pdf/2502.09838">pdf</a>, <a href="https://arxiv.org/format/2502.09838">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> HealthGPT: A Medical Large Vision-Language Model for Unifying Comprehension and Generation via Heterogeneous Knowledge Adaptation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lin%2C+T">Tianwei Lin</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+W">Wenqiao Zhang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+S">Sijing Li</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+Y">Yuqian Yuan</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+B">Binhe Yu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+H">Haoyuan Li</a>, <a href="/search/cs?searchtype=author&query=He%2C+W">Wanggui He</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+H">Hao Jiang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+M">Mengze Li</a>, <a href="/search/cs?searchtype=author&query=Song%2C+X">Xiaohui Song</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+S">Siliang Tang</a>, <a href="/search/cs?searchtype=author&query=Xiao%2C+J">Jun Xiao</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+H">Hui Lin</a>, <a href="/search/cs?searchtype=author&query=Zhuang%2C+Y">Yueting Zhuang</a>, <a 
href="/search/cs?searchtype=author&query=Ooi%2C+B+C">Beng Chin Ooi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.09838v1-abstract-short" style="display: inline;"> We present HealthGPT, a powerful Medical Large Vision-Language Model (Med-LVLM) that integrates medical visual comprehension and generation capabilities within a unified autoregressive paradigm. Our bootstrapping philosophy is to progressively adapt heterogeneous comprehension and generation knowledge to pre-trained large language models (LLMs). This is achieved through a novel heterogeneous low-r… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.09838v1-abstract-full').style.display = 'inline'; document.getElementById('2502.09838v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.09838v1-abstract-full" style="display: none;"> We present HealthGPT, a powerful Medical Large Vision-Language Model (Med-LVLM) that integrates medical visual comprehension and generation capabilities within a unified autoregressive paradigm. Our bootstrapping philosophy is to progressively adapt heterogeneous comprehension and generation knowledge to pre-trained large language models (LLMs). This is achieved through a novel heterogeneous low-rank adaptation (H-LoRA) technique, which is complemented by a tailored hierarchical visual perception approach and a three-stage learning strategy. To effectively learn the HealthGPT, we devise a comprehensive medical domain-specific comprehension and generation dataset called VL-Health. Experimental results demonstrate exceptional performance and scalability of HealthGPT in medical visual unified tasks. Our project can be accessed at https://github.com/DCDmllm/HealthGPT. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.09838v1-abstract-full').style.display = 'none'; document.getElementById('2502.09838v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.09662">arXiv:2502.09662</a> <span> [<a href="https://arxiv.org/pdf/2502.09662">pdf</a>, <a href="https://arxiv.org/format/2502.09662">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Quantitative Methods">q-bio.QM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> Generalizable Cervical Cancer Screening via Large-scale Pretraining and Test-Time Adaptation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Jiang%2C+H">Hao Jiang</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+C">Cheng Jin</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+H">Huangjing Lin</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+Y">Yanning Zhou</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xi Wang</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+J">Jiabo Ma</a>, <a href="/search/cs?searchtype=author&query=Ding%2C+L">Li Ding</a>, <a href="/search/cs?searchtype=author&query=Hou%2C+J">Jun Hou</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+R">Runsheng Liu</a>, <a href="/search/cs?searchtype=author&query=Chai%2C+Z">Zhizhong Chai</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+L">Luyang Luo</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+H">Huijuan Shi</a>, <a href="/search/cs?searchtype=author&query=Qian%2C+Y">Yinling Qian</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Q">Qiong Wang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+C">Changzhong Li</a>, <a href="/search/cs?searchtype=author&query=Han%2C+A">Anjia Han</a>, <a href="/search/cs?searchtype=author&query=Chan%2C+R+C+K">Ronald Cheong Kin Chan</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+H">Hao Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.09662v1-abstract-short" style="display: inline;"> Cervical cancer is a leading malignancy in female reproductive system. While AI-assisted cytology offers a cost-effective and non-invasive screening solution, current systems struggle with generalizability in complex clinical scenarios. To address this issue, we introduced Smart-CCS, a generalizable Cervical Cancer Screening paradigm based on pretraining and adaptation to create robust and general… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.09662v1-abstract-full').style.display = 'inline'; document.getElementById('2502.09662v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.09662v1-abstract-full" style="display: none;"> Cervical cancer is a leading malignancy in female reproductive system. While AI-assisted cytology offers a cost-effective and non-invasive screening solution, current systems struggle with generalizability in complex clinical scenarios. 
3. arXiv:2502.08991 [pdf, other] — cs.LG (Machine Learning), stat.ML (Machine Learning)

   Task Generalization With AutoRegressive Compositional Structure: Can Learning From $d$ Tasks Generalize to $d^T$ Tasks?
</p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Abedsoltan%2C+A">Amirhesam Abedsoltan</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+H">Huaqing Zhang</a>, <a href="/search/cs?searchtype=author&query=Wen%2C+K">Kaiyue Wen</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+H">Hongzhou Lin</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jingzhao Zhang</a>, <a href="/search/cs?searchtype=author&query=Belkin%2C+M">Mikhail Belkin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.08991v1-abstract-short" style="display: inline;"> Large language models (LLMs) exhibit remarkable task generalization, solving tasks they were never explicitly trained on with only a few demonstrations. This raises a fundamental question: When can learning from a small set of tasks generalize to a large task family? In this paper, we investigate task generalization through the lens of AutoRegressive Compositional (ARC) structure, where each task… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.08991v1-abstract-full').style.display = 'inline'; document.getElementById('2502.08991v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.08991v1-abstract-full" style="display: none;"> Large language models (LLMs) exhibit remarkable task generalization, solving tasks they were never explicitly trained on with only a few demonstrations. This raises a fundamental question: When can learning from a small set of tasks generalize to a large task family? In this paper, we investigate task generalization through the lens of AutoRegressive Compositional (ARC) structure, where each task is a composition of $T$ operations, and each operation is among a finite family of $\d$ subtasks. This yields a total class of size~\( \d^\TT \). We first show that generalization to all \( \d^\TT \) tasks is theoretically achievable by training on only \( \tilde{O}(\d) \) tasks. Empirically, we demonstrate that Transformers achieve such exponential task generalization on sparse parity functions via in-context learning (ICL) and Chain-of-Thought (CoT) reasoning. We further demonstrate this generalization in arithmetic and language translation, extending beyond parity functions. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.08991v1-abstract-full').style.display = 'none'; document.getElementById('2502.08991v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. 
4. arXiv:2502.08888 [pdf, other] — cs.CL (Computation and Language)

   LLM-Enhanced Multiple Instance Learning for Joint Rumor and Stance Detection with Social Context Information

   Authors: Ruichao Yang, Jing Ma, Wei Gao, Hongzhan Lin

   Abstract: The proliferation of misinformation, such as rumors on social media, has drawn significant attention, prompting various expressions of stance among users. Although rumor detection and stance detection are distinct tasks, they can complement each other. Rumors can be identified by cross-referencing stances in related posts, and stances are influenced by the nature of the rumor. However, existing stance detection methods often require post-level stance annotations, which are costly to obtain. We propose a novel LLM-enhanced multiple instance learning (MIL) approach to jointly predict post stance and claim class labels, supervised solely by claim labels, using an undirected microblog propagation model. Our weakly supervised approach relies only on bag-level labels of claim veracity, aligning with MIL principles. To achieve this, we transform the multi-class problem into multiple MIL-based binary classification problems. We then employ a discriminative attention layer to aggregate the outputs from these classifiers into finer-grained classes. Experiments conducted on three rumor datasets and two stance datasets demonstrate the effectiveness of our approach, highlighting strong connections between rumor veracity and the stances expressed in responding posts. Our method shows promising performance in joint rumor and stance detection compared to state-of-the-art methods.

   Submitted 12 February, 2025; originally announced February 2025.

   Comments: Accepted by ACM TIST
5. arXiv:2502.08019 [pdf, other] — cs.NI (Networking and Internet Architecture)

   Connectivity of LEO Satellite Mega Constellations: An Application of Percolation Theory on a Sphere

   Authors: Hao Lin, Mustafa A. Kishk, Mohamed-Slim Alouini

   Abstract: With the advent of the 6G era, global connectivity has become a common goal in the evolution of communications, aiming to bring Internet services to more unconnected regions. The rise of applications such as the Internet of Everything and remote education also requires global connectivity. Non-terrestrial networks (NTN), particularly low earth orbit (LEO) satellites, play a crucial role in this future vision. Although some literature already analyzes coverage performance using stochastic geometry, the conditions under which a large-scale continuous service area emerges remain to be analyzed. Therefore, in this paper, we investigate the necessary conditions on LEO satellite deployment for large-scale continuous service coverage on the earth. First, we apply percolation theory to a closed spherical surface and, for the first time, define percolation on a sphere. We introduce the sub-critical and super-critical cases to prove the existence of a phase transition in the percolation probability. Then, through stereographic projection, we derive tight bounds and a closed-form expression for the critical number of LEO satellites in the same constellation. In addition, we investigate how the altitude and maximum slant range of LEO satellites affect the percolation probability, and derive their critical values. Based on our findings, we provide useful recommendations for companies planning to deploy LEO satellite networks to enhance connectivity.

   Submitted 11 February, 2025; originally announced February 2025.
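   The phase transition the abstract proves can be illustrated numerically: sample points uniformly on a sphere, link pairs within an angular threshold, and watch the largest cluster jump from tiny to near-global as the threshold grows. The simulation below is a toy illustration under my own parameter choices, not the paper's model or values.

      import numpy as np

      def largest_cluster(n=500, theta=0.12, seed=0):
          # n "satellites" uniform on the unit sphere (normalized Gaussians).
          rng = np.random.default_rng(seed)
          v = rng.normal(size=(n, 3))
          v /= np.linalg.norm(v, axis=1, keepdims=True)
          parent = list(range(n))
          def find(i):                         # union-find with path halving
              while parent[i] != i:
                  parent[i] = parent[parent[i]]
                  i = parent[i]
              return i
          ang = np.arccos(np.clip(v @ v.T, -1.0, 1.0))  # pairwise angular distance
          for i in range(n):
              for j in range(i + 1, n):
                  if ang[i, j] < theta:        # link points within the threshold
                      parent[find(i)] = find(j)
          roots = [find(i) for i in range(n)]
          return max(np.bincount(roots)) / n   # fraction in the largest cluster

      # Sweeping theta shows the sharp jump that percolation theory formalizes.
      print(largest_cluster(theta=0.05), largest_cluster(theta=0.20))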
6. arXiv:2502.08010 [pdf, other] — cs.NI (Networking and Internet Architecture)

   Energy-as-a-Service for RF-Powered IoE Networks: A Percolation Theory Approach

   Authors: Hao Lin, Ainur Zhaikhan, Mustafa A. Kishk, Hesham ElSawy, Mohamed-Slim Alouini

   Abstract: Due to the massive number of devices involved, radio frequency (RF) energy harvesting is indispensable to realizing the foreseen Internet-of-Everything (IoE) within 6G networks. Analogous to the cellular network concept, shared energy stations (ESs) are foreseen to supply energy-as-a-service (EaaS) in order to recharge devices that belong to different IoE operators offering diverse use cases. Considering the capital expenditure (CAPEX) for ES deployment along with their finite wireless energy transfer (WET) zones, spatial energy gaps are plausible. Furthermore, the ES deployment cannot cover 100% of the energy-harvesting devices of all coexisting IoE use cases. In this context, we utilize percolation theory to characterize the feasibility of large-scale device-to-device (D2D) connectivity in IoE networks operating under EaaS platforms. Assuming that ESs and IoE devices follow independent Poisson point processes (PPPs), we construct a connectivity graph for the IoE devices that lie within the WET zones of ESs. Continuum percolation on the constructed graph is used to derive necessary and sufficient conditions for large-scale RF-powered D2D connectivity in terms of the required IoE device density and communication range, along with the required ES density and WET zone size. Fixing the IoE network parameters along with the size of the WET zones, we obtain the approximate critical value of the ES density that ensures large-scale connectivity, using the inner-city and Gilbert disk models. By imitating the bounds and combining the approximations, we construct an approximate expression for the critical ES density function, which is necessary to minimize the EaaS CAPEX under the IoE connectivity constraint.

   Submitted 11 February, 2025; originally announced February 2025.

7. arXiv:2502.07640 [pdf, other] — cs.LG (Machine Learning), cs.AI (Artificial Intelligence)

   Goedel-Prover: A Frontier Model for Open-Source Automated Theorem Proving

   Authors: Yong Lin, Shange Tang, Bohan Lyu, Jiayun Wu, Hongzhou Lin, Kaiyu Yang, Jia Li, Mengzhou Xia, Danqi Chen, Sanjeev Arora, Chi Jin
   Abstract: We introduce Goedel-Prover, an open-source large language model (LLM) that achieves state-of-the-art (SOTA) performance in automated formal proof generation for mathematical problems. The key challenge in this field is the scarcity of formalized math statements and proofs, which we tackle in the following ways. We train statement formalizers to translate the natural language math problems from Numina into formal language (Lean 4), creating a dataset of 1.64 million formal statements. LLMs are used to check that the formal statements accurately preserve the content of the original natural language problems. We then iteratively build a large dataset of formal proofs by training a series of provers. Each prover succeeds in proving many statements that the previous ones could not, and these new proofs are added to the training set for the next prover. Despite using only supervised fine-tuning, our final prover significantly outperforms the previous best open-source model, DeepSeek-Prover-V1.5, which employs reinforcement learning. On the miniF2F benchmark, our model achieves a success rate of 57.6% (Pass@32), surpassing DeepSeek-Prover-V1.5 by 7.6%. On PutnamBench, Goedel-Prover successfully solves 7 problems (Pass@512), ranking first on the leaderboard. Furthermore, it generates 29.7K formal proofs for Lean Workbook problems, nearly doubling the 15.7K produced by earlier works.

   Submitted 14 February, 2025; v1 submitted 11 February, 2025; originally announced February 2025.
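   The abstract describes an expert-iteration-style loop: each prover's newly verified proofs become training data for the next prover. A minimal toy sketch of that loop follows; train_prover and try_prove are hypothetical stand-ins for fine-tuning and proof search, and only the loop structure follows the abstract.

      import random

      def train_prover(proofs):
          # Hypothetical stand-in: model a "prover" as a success probability
          # that grows with the size of its verified training set.
          return min(0.9, 0.2 + 0.05 * len(proofs))

      def try_prove(prover, statement):
          # Hypothetical stand-in for proof search plus formal verification.
          return f"proof of {statement}" if random.random() < prover else None

      def bootstrap(statements, rounds=4):
          proofs = {}                            # statement -> verified proof
          for _ in range(rounds):
              prover = train_prover(proofs)      # retrain on the enlarged set
              for s in statements:
                  if s not in proofs:
                      p = try_prove(prover, s)
                      if p is not None:          # keep only verified proofs
                          proofs[s] = p
          return proofs

      solved = bootstrap([f"thm_{i}" for i in range(100)])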
8. arXiv:2502.07556 [pdf, other] — cs.HC (Human-Computer Interaction), cs.CV (Computer Vision and Pattern Recognition)

   SketchFlex: Facilitating Spatial-Semantic Coherence in Text-to-Image Generation with Region-Based Sketches

   Authors: Haichuan Lin, Yilin Ye, Jiazhi Xia, Wei Zeng

   Abstract: Text-to-image models can generate visually appealing images from text descriptions. Efforts have been devoted to improving model control with prompt tuning and spatial conditioning. However, our formative study highlights the challenges non-expert users face in crafting appropriate prompts and specifying fine-grained spatial conditions (e.g., depth or canny references) to generate semantically cohesive images, especially when multiple objects are involved. In response, we introduce SketchFlex, an interactive system designed to improve the flexibility of spatially conditioned image generation using rough region sketches. The system automatically infers user prompts with rational descriptions within a semantic space enriched by crowd-sourced object attributes and relationships. Additionally, SketchFlex refines users' rough sketches into canny-based shape anchors, ensuring generation quality and alignment with user intentions. Experimental results demonstrate that SketchFlex achieves more cohesive image generations than end-to-end models, while significantly reducing cognitive load and better matching user intentions compared to a region-based generation baseline.

   Submitted 11 February, 2025; originally announced February 2025.

   Comments: conference: CHI2025

9. arXiv:2502.07216 [pdf, other] — cs.CV (Computer Vision and Pattern Recognition), cs.AI (Artificial Intelligence); doi: 10.1145/3664647.3681043

   SparseFormer: Detecting Objects in HRW Shots via Sparse Vision Transformer

   Authors: Wenxi Li, Yuchen Guo, Jilai Zheng, Haozhe Lin, Chao Ma, Lu Fang, Xiaokang Yang
   Abstract: Recent years have seen an increase in the use of gigapixel-level image and video capture systems and benchmarks with high-resolution wide (HRW) shots. However, unlike the close-up shots in the MS COCO dataset, the higher resolution and wider field of view raise unique challenges, such as extreme sparsity and huge scale changes, which make existing close-up detectors inaccurate and inefficient. In this paper, we present a novel model-agnostic sparse vision transformer, dubbed SparseFormer, to bridge the gap in object detection between close-up and HRW shots. The proposed SparseFormer selectively uses attentive tokens to scrutinize the sparsely distributed windows that may contain objects. In this way, it can jointly explore global and local attention by fusing coarse- and fine-grained features to handle huge scale changes. SparseFormer also benefits from a novel cross-slice non-maximum suppression (C-NMS) algorithm to precisely localize objects from noisy windows, and from a simple yet effective multi-scale strategy to improve accuracy. Extensive experiments on two HRW benchmarks, PANDA and DOTA-v1.0, demonstrate that the proposed SparseFormer significantly improves detection accuracy (by up to 5.8%) and speed (by up to 3x) over state-of-the-art approaches.

   Submitted 10 February, 2025; originally announced February 2025.

   Comments: This paper is accepted to ACM MM 2024
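   For reference, the suppression step that C-NMS extends across window slices is ordinary NMS. The sketch below is plain NMS over boxes pooled from all slices, which is an illustrative simplification; the paper's cross-slice handling is not specified in this abstract.

      import torch

      def nms(boxes, scores, iou_thr=0.5):     # boxes: (N, 4) as x1, y1, x2, y2
          order = scores.argsort(descending=True)
          keep = []
          while order.numel() > 0:
              i = order[0].item()
              keep.append(i)                   # highest-scoring remaining box
              if order.numel() == 1:
                  break
              rest = boxes[order[1:]]
              lt = torch.maximum(boxes[i, :2], rest[:, :2])
              rb = torch.minimum(boxes[i, 2:], rest[:, 2:])
              inter = (rb - lt).clamp(min=0).prod(dim=1)
              area_i = (boxes[i, 2:] - boxes[i, :2]).prod()
              area_r = (rest[:, 2:] - rest[:, :2]).prod(dim=1)
              iou = inter / (area_i + area_r - inter)
              order = order[1:][iou <= iou_thr]  # drop heavily overlapping boxes
          return keep

      boxes = torch.tensor([[0, 0, 10, 10], [1, 1, 11, 11], [50, 50, 60, 60]],
                           dtype=torch.float)
      print(nms(boxes, torch.tensor([0.9, 0.8, 0.7])))  # -> [0, 2]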
10. arXiv:2502.06913 [pdf, other] — q-bio.QM (Quantitative Methods), cs.AI (Artificial Intelligence), cs.LG (Machine Learning)

   A Simple yet Effective DDG Predictor is An Unsupervised Antibody Optimizer and Explainer

   Authors: Lirong Wu, Yunfan Liu, Haitao Lin, Yufei Huang, Guojiang Zhao, Zhifeng Gao, Stan Z. Li

   Abstract: The proteins that exist today have been optimized over billions of years of natural evolution, during which nature creates random mutations and selects them. The discovery of functionally promising mutations is hindered by the limited evolutionarily accessible regions, i.e., only a small region of the fitness landscape is beneficial. Numerous priors have been used to constrain protein evolution to regions of landscapes with high-fitness variants, among which the change in binding free energy (DDG) of protein complexes upon mutation is one of the most commonly used. However, the huge mutation space poses two challenges: (1) how to improve the efficiency of DDG prediction for fast mutation screening; and (2) how to explain mutation preferences and efficiently explore accessible evolutionary regions. To address these challenges, we propose a lightweight DDG predictor (Light-DDG), which adopts a structure-aware Transformer as the backbone and enhances it with knowledge distilled from existing powerful but computationally heavy DDG predictors. Additionally, we augmented, annotated, and released a large-scale dataset containing millions of mutation data points for pre-training Light-DDG. We find that such a simple yet effective Light-DDG can serve as a good unsupervised antibody optimizer and explainer. For the target antibody, we propose a novel Mutation Explainer to learn mutation preferences, which accounts for the marginal benefit of each mutation per residue. To further explore accessible evolutionary regions, we conduct preference-guided antibody optimization and quickly evaluate antibody candidates using Light-DDG to identify desirable mutations.

   Submitted 13 February, 2025; v1 submitted 10 February, 2025; originally announced February 2025.
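   The abstract's central training trick is distilling a heavy DDG predictor into a lightweight one. Below is a minimal, hedged sketch of plain regression distillation under that reading; the toy model shapes, feature dimension, and single-loss setup are my own illustrative assumptions, not details from the paper.

      import torch
      import torch.nn as nn

      # Toy teacher (heavy) and student (light) DDG regressors.
      teacher = nn.Sequential(nn.Linear(64, 256), nn.ReLU(),
                              nn.Linear(256, 1)).eval()
      student = nn.Sequential(nn.Linear(64, 32), nn.ReLU(), nn.Linear(32, 1))
      opt = torch.optim.Adam(student.parameters(), lr=1e-3)

      for _ in range(100):
          feats = torch.randn(128, 64)        # stand-in mutation features
          with torch.no_grad():
              soft_targets = teacher(feats)   # heavy model's DDG predictions
          loss = nn.functional.mse_loss(student(feats), soft_targets)
          opt.zero_grad(); loss.backward(); opt.step()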
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.06870">arXiv:2502.06870</a> <span> [<a href="https://arxiv.org/pdf/2502.06870">pdf</a>, <a href="https://arxiv.org/format/2502.06870">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Bridging Traffic State and Trajectory for Dynamic Road Network and Trajectory Representation Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Han%2C+C">Chengkai Han</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+J">Jingyuan Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yongyao Wang</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+X">Xie Yu</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+H">Hao Lin</a>, <a href="/search/cs?searchtype=author&query=Li%2C+C">Chao Li</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+J">Junjie Wu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.06870v1-abstract-short" style="display: inline;"> Effective urban traffic management is vital for sustainable city development, relying on intelligent systems with machine learning tasks such as traffic flow prediction and travel time estimation. Traditional approaches usually focus on static road network and trajectory representation learning, and overlook the dynamic nature of traffic states and trajectories, which is crucial for downstream tas… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.06870v1-abstract-full').style.display = 'inline'; document.getElementById('2502.06870v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.06870v1-abstract-full" style="display: none;"> Effective urban traffic management is vital for sustainable city development, relying on intelligent systems with machine learning tasks such as traffic flow prediction and travel time estimation. Traditional approaches usually focus on static road network and trajectory representation learning, and overlook the dynamic nature of traffic states and trajectories, which is crucial for downstream tasks. To address this gap, we propose TRACK, a novel framework to bridge traffic state and trajectory data for dynamic road network and trajectory representation learning. TRACK leverages graph attention networks (GAT) to encode static and spatial road segment features, and introduces a transformer-based model for trajectory representation learning. By incorporating transition probabilities from trajectory data into GAT attention weights, TRACK captures dynamic spatial features of road segments. Meanwhile, TRACK designs a traffic transformer encoder to capture the spatial-temporal dynamics of road segments from traffic state data. To further enhance dynamic representations, TRACK proposes a co-attentional transformer encoder and a trajectory-traffic state matching task. Extensive experiments on real-life urban traffic datasets demonstrate the superiority of TRACK over state-of-the-art baselines. 
Case studies confirm TRACK's ability to capture spatial-temporal dynamics effectively. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">9 pages, 6 figures</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">MSC Class:</span> 68T07 <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> I.2.m </p> </li>
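<p class="is-size-7">To make the attention-reweighting idea in the TRACK abstract concrete, here is a hedged NumPy sketch of folding empirical transition probabilities into GAT-style attention logits; the additive log-probability blending and all names are assumptions, not the paper's exact formulation.</p> <pre><code>
import numpy as np

def transition_aware_attention(logits, transitions, eps=1e-9):
    """Bias GAT attention logits with road-segment transition probabilities:
    segment pairs that trajectories actually traverse in sequence get larger
    attention weights.

    logits: (N, N) raw attention scores from a GAT layer
    transitions: (N, N) row-stochastic transition frequencies from trajectories
    """
    biased = logits + np.log(transitions + eps)   # log-prob as an additive bias
    biased -= biased.max(axis=1, keepdims=True)   # numerical stability
    weights = np.exp(biased)
    return weights / weights.sum(axis=1, keepdims=True)
</code></pre>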
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.06663">arXiv:2502.06663</a> <span> [<a href="https://arxiv.org/pdf/2502.06663">pdf</a>, <a href="https://arxiv.org/format/2502.06663">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> EfficientLLM: Scalable Pruning-Aware Pretraining for Architecture-Agnostic Edge Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xing%2C+X">Xingrun Xing</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Z">Zheng Liu</a>, <a href="/search/cs?searchtype=author&query=Xiao%2C+S">Shitao Xiao</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+B">Boyan Gao</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+Y">Yiming Liang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+W">Wanpeng Zhang</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+H">Haokun Lin</a>, <a href="/search/cs?searchtype=author&query=Li%2C+G">Guoqi Li</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jiajun Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2502.06663v2-abstract-full" style="display: inline;"> Modern large language models (LLMs), driven by scaling laws, achieve emergent intelligence at large model sizes. Recently, increasing concerns about cloud costs, latency, and privacy have made it urgent to develop compact edge language models. Distinguished from direct pretraining, which is bounded by the scaling law, this work proposes pruning-aware pretraining, which focuses on retaining the performance of much larger optimized models. It has the following characteristics: 1) Data-scalable: we introduce minimal parameter groups in the LLM and continuously optimize structural pruning, extending post-training pruning methods like LLM-Pruner and SparseGPT into the pretraining phase. 2) Architecture-agnostic: the LLM architecture is auto-designed using saliency-driven pruning and, for the first time in modern pretraining, exceeds SoTA human-designed LLMs. We reveal that this approach achieves top-quality edge language models, termed EfficientLLM, by scaling up LLM compression and extending its boundary. EfficientLLM significantly outperforms SoTA baselines with $100M \sim 1B$ parameters, such as MobileLLM, SmolLM, Qwen2.5-0.5B, OLMo-1B, and Llama3.2-1B, on common-sense benchmarks. As a first attempt, EfficientLLM bridges the performance gap between traditional LLM compression and direct pretraining methods; we will fully open-source it at https://github.com/Xingrun-Xing2/EfficientLLM. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 10 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li>
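<p class="is-size-7">A hedged sketch of one common form of saliency-driven structural pruning (first-order weight-times-gradient scores per channel, in the spirit of LLM-Pruner-style methods the abstract cites); the scoring rule and function names are assumptions, not EfficientLLM's released implementation.</p> <pre><code>
import torch

def channel_saliency(weight: torch.Tensor, grad: torch.Tensor) -> torch.Tensor:
    """First-order saliency of each output channel of a linear layer:
    sum of |w * dL/dw| over the channel, a proxy for the loss increase
    if that parameter group were removed. Shapes: (out, in)."""
    return (weight * grad).abs().sum(dim=1)

def keep_mask(saliency: torch.Tensor, keep_ratio: float) -> torch.Tensor:
    """Boolean mask selecting the highest-saliency channels to keep."""
    k = max(1, int(keep_ratio * saliency.numel()))
    mask = torch.zeros_like(saliency, dtype=torch.bool)
    mask[saliency.topk(k).indices] = True
    return mask

# usage on a toy layer: keep the most salient 50% of output channels
w, g = torch.randn(8, 16), torch.randn(8, 16)
mask = keep_mask(channel_saliency(w, g), keep_ratio=0.5)
</code></pre>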
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.06207">arXiv:2502.06207</a> <span> [<a href="https://arxiv.org/pdf/2502.06207">pdf</a>, <a href="https://arxiv.org/format/2502.06207">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Unveiling the Capabilities of Large Language Models in Detecting Offensive Language with Annotation Disagreement </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lu%2C+J">Junyu Lu</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+K">Kai Ma</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+K">Kaichun Wang</a>, <a href="/search/cs?searchtype=author&query=Xiao%2C+K">Kelaiti Xiao</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+R+K">Roy Ka-Wei Lee</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+B">Bo Xu</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+L">Liang Yang</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+H">Hongfei Lin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2502.06207v1-abstract-full" style="display: inline;"> LLMs are widely used for offensive language detection due to their advanced capabilities. However, the challenges posed by human annotation disagreement in real-world datasets remain underexplored. These disagreement samples are difficult to detect due to their ambiguous nature. Additionally, the confidence of LLMs in processing disagreement samples can provide valuable insights into their alignment with human annotators. To address this gap, we systematically evaluate the ability of LLMs to detect offensive language with annotation disagreement. We compare the binary accuracy of multiple LLMs across varying annotation agreement levels and analyze the relationship between LLM confidence and annotation agreement. Furthermore, we investigate the impact of disagreement samples on LLM decision-making during few-shot learning and instruction fine-tuning. Our findings highlight the challenges posed by disagreement samples and offer guidance for improving LLM-based offensive language detection. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">17 pages, submitted to ACL 2025</span> </p> </li>
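<p class="is-size-7">The core analysis described above, binary accuracy bucketed by annotation agreement level, is straightforward to reproduce; a small self-contained sketch (the agreement-level labels are illustrative assumptions):</p> <pre><code>
from collections import defaultdict

def accuracy_by_agreement(examples):
    """Group evaluation examples by annotation agreement level and report
    binary accuracy per level.
    Each example: (agreement_level, gold_label, model_label)."""
    totals, hits = defaultdict(int), defaultdict(int)
    for level, gold, pred in examples:
        totals[level] += 1
        hits[level] += int(gold == pred)
    return {level: hits[level] / totals[level] for level in totals}

# hypothetical levels such as "unanimous", "majority", "split"
print(accuracy_by_agreement([("unanimous", 1, 1), ("split", 0, 1), ("split", 0, 0)]))
</code></pre>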
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.04960">arXiv:2502.04960</a> <span> [<a href="https://arxiv.org/pdf/2502.04960">pdf</a>, <a href="https://arxiv.org/format/2502.04960">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Commonality and Individuality! Integrating Humor Commonality with Speaker Individuality for Humor Recognition </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhu%2C+H">Haohao Zhu</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+J">Junyu Lu</a>, <a href="/search/cs?searchtype=author&query=Zeng%2C+Z">Zeyuan Zeng</a>, <a href="/search/cs?searchtype=author&query=Bai%2C+Z">Zewen Bai</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xiaokun Zhang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+L">Liang Yang</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+H">Hongfei Lin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2502.04960v1-abstract-full" style="display: inline;"> Humor recognition aims to identify whether a specific speaker's text is humorous. Current methods for humor recognition mainly suffer from two limitations: (1) they solely focus on one aspect of humor commonalities, ignoring the multifaceted nature of humor; and (2) they typically overlook the critical role of speaker individuality, which is essential for a comprehensive understanding of humor expressions. To bridge these gaps, we introduce the Commonality and Individuality Incorporated Network for Humor Recognition (CIHR), a novel model designed to enhance humor recognition by integrating multifaceted humor commonalities with the distinctive individuality of speakers. The CIHR features a Humor Commonality Analysis module that explores various perspectives of multifaceted humor commonality within user texts, and a Speaker Individuality Extraction module that captures both static and dynamic aspects of a speaker's profile to accurately model their distinctive individuality. Additionally, Static and Dynamic Fusion modules are introduced to effectively incorporate the humor commonality with the speaker's individuality in the humor recognition process. Extensive experiments demonstrate the effectiveness of CIHR, underscoring the importance of concurrently addressing both multifaceted humor commonality and distinctive speaker individuality in humor recognition. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025.
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by NAACL 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.04128">arXiv:2502.04128</a> <span> [<a href="https://arxiv.org/pdf/2502.04128">pdf</a>, <a href="https://arxiv.org/format/2502.04128">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> Llasa: Scaling Train-Time and Inference-Time Compute for Llama-based Speech Synthesis </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ye%2C+Z">Zhen Ye</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+X">Xinfa Zhu</a>, <a href="/search/cs?searchtype=author&query=Chan%2C+C">Chi-Min Chan</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xinsheng Wang</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+X">Xu Tan</a>, <a href="/search/cs?searchtype=author&query=Lei%2C+J">Jiahe Lei</a>, <a href="/search/cs?searchtype=author&query=Peng%2C+Y">Yi Peng</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+H">Haohe Liu</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+Y">Yizhu Jin</a>, <a href="/search/cs?searchtype=author&query=DAI%2C+Z">Zheqi DAI</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+H">Hongzhan Lin</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+J">Jianyi Chen</a>, <a href="/search/cs?searchtype=author&query=Du%2C+X">Xingjian Du</a>, <a href="/search/cs?searchtype=author&query=Xue%2C+L">Liumeng Xue</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yunlin Chen</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zhifei Li</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+L">Lei Xie</a>, <a href="/search/cs?searchtype=author&query=Kong%2C+Q">Qiuqiang Kong</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+Y">Yike Guo</a>, <a href="/search/cs?searchtype=author&query=Xue%2C+W">Wei Xue</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>:
<span class="abstract-full has-text-grey-dark mathjax" id="2502.04128v1-abstract-full" style="display: inline;"> Recent advances in text-based large language models (LLMs), particularly in the GPT series and the o1 model, have demonstrated the effectiveness of scaling both training-time and inference-time compute. However, current state-of-the-art TTS systems leveraging LLMs are often multi-stage, requiring separate models (e.g., diffusion models after the LLM), complicating the decision of whether to scale a particular model during training or testing. This work makes the following contributions: First, we explore the scaling of train-time and inference-time compute for speech synthesis. Second, we propose a simple framework, Llasa, for speech synthesis that employs a single-layer vector quantizer (VQ) codec and a single Transformer architecture to fully align with standard LLMs such as Llama. Our experiments reveal that scaling train-time compute for Llasa consistently improves the naturalness of synthesized speech and enables the generation of more complex and accurate prosody patterns. Furthermore, from the perspective of scaling inference-time compute, we employ speech understanding models as verifiers during the search, finding that scaling inference-time compute shifts the sampling modes toward the preferences of specific verifiers, thereby improving emotional expressiveness, timbre consistency, and content accuracy. In addition, we publicly release the checkpoints and training code for our TTS models (1B, 3B, 8B) and codec model. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li>
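<p class="is-size-7">The verifier-guided inference-time scaling described above reduces, in its simplest form, to best-of-N search; a minimal sketch, where <code>synthesize</code> and <code>verifier</code> are hypothetical stand-ins for a TTS sampler and a speech-understanding scorer:</p> <pre><code>
def best_of_n_tts(text, synthesize, verifier, n=8):
    """Spend more inference-time compute: draw n candidate utterances and
    return the one the verifier scores highest (e.g., on content accuracy,
    emotion, or timbre consistency)."""
    candidates = [synthesize(text) for _ in range(n)]
    return max(candidates, key=verifier)
</code></pre> <p class="is-size-7">As the abstract notes, which sample "wins" depends on the verifier's preferences, so the choice of verifier effectively steers the sampling mode.</p>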
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.03806">arXiv:2502.03806</a> <span> [<a href="https://arxiv.org/pdf/2502.03806">pdf</a>, <a href="https://arxiv.org/format/2502.03806">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Software Engineering">cs.SE</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Should Code Models Learn Pedagogically? A Preliminary Evaluation of Curriculum Learning for Real-World Software Engineering Tasks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Khant%2C+K+S">Kyi Shin Khant</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+H+Y">Hong Yi Lin</a>, <a href="/search/cs?searchtype=author&query=Thongtanunam%2C+P">Patanamon Thongtanunam</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2502.03806v1-abstract-full" style="display: inline;"> Learning-based techniques, especially advanced pre-trained models for code, have demonstrated capabilities in code understanding and generation, solving diverse software engineering (SE) tasks. Despite the promising results, current training approaches may not fully optimize model performance, as they typically involve learning from randomly shuffled training data. Recent work shows that Curriculum Learning (CL) can improve performance on code-related tasks through incremental learning based on the difficulty of synthetic code. Yet, the effectiveness of CL with conventional difficulty measures in SE tasks remains largely unexplored. In this study, we explore two conventional code metrics, code length and cyclomatic complexity, to determine difficulty levels. We investigate how the pre-trained code model (CodeT5) learns under CL, through the tasks of code clone detection and code summarization. Our empirical study on the CodeXGLUE benchmark showed contrasting results to prior studies, where the model exhibited signs of catastrophic forgetting and shortcut learning. Surprisingly, model performance saturates after only the first quartile of training, potentially indicating a limit in the model's representation capacity and/or the task's inherent difficulty. Future work should further explore various CL strategies with different code models across a wider range of SE tasks for a more holistic understanding. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by the 22nd International Conference on Mining Software Repositories (MSR 25)</span> </p> </li>
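<p class="is-size-7">The curriculum setup the abstract describes, ordering training data from easy to hard by a conventional code metric, is simple to express; a sketch (the staging scheme is an illustrative assumption, and a real cyclomatic-complexity measure would need a parser):</p> <pre><code>
def curriculum_stages(samples, difficulty, n_stages=4):
    """Order training samples from easy to hard using a difficulty metric
    (e.g., code length or cyclomatic complexity) and split into stages
    presented to the model incrementally."""
    ordered = sorted(samples, key=difficulty)
    stage_size = max(1, len(ordered) // n_stages)
    return [ordered[i:i + stage_size] for i in range(0, len(ordered), stage_size)]

# code length as the difficulty proxy
stages = curriculum_stages(["x = 1", "def f(a):\n    return a * 2", "class C:\n    pass"],
                           difficulty=len)
</code></pre>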
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.03550">arXiv:2502.03550</a> <span> [<a href="https://arxiv.org/pdf/2502.03550">pdf</a>, <a href="https://arxiv.org/format/2502.03550">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> TD-M(PC)$^2$: Improving Temporal Difference MPC Through Policy Constraint </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lin%2C+H">Haotian Lin</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+P">Pengcheng Wang</a>, <a href="/search/cs?searchtype=author&query=Schneider%2C+J">Jeff Schneider</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+G">Guanya Shi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2502.03550v1-abstract-full" style="display: inline;"> Model-based reinforcement learning algorithms that combine model-based planning with a learned value/policy prior have gained significant recognition for their high data efficiency and superior performance in continuous control. However, we discover that existing methods that rely on standard SAC-style policy iteration for value learning, directly using data generated by the planner, often result in \emph{persistent value overestimation}. Through theoretical analysis and experiments, we argue that this issue is deeply rooted in the structural policy mismatch between the data generation policy that is always bootstrapped by the planner and the learned policy prior. To mitigate such a mismatch in a minimalist way, we propose a policy regularization term reducing out-of-distribution (OOD) queries, thereby improving value learning. Our method involves minimum changes on top of existing frameworks and requires no additional computation. Extensive experiments demonstrate that the proposed approach improves performance over baselines such as TD-MPC2 by large margins, particularly in 61-DoF humanoid tasks. View qualitative results at https://darthutopian.github.io/tdmpc_square/. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li>
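<p class="is-size-7">One plausible reading of the policy-constraint idea is a SAC-style policy objective plus a term anchoring the learned prior to the planner's actions, which discourages out-of-distribution value queries; this PyTorch sketch is an assumption about the regularizer's form (the abstract does not spell it out), with all names illustrative:</p> <pre><code>
import torch

def policy_loss_with_constraint(q_value, log_prob, action, planner_action,
                                alpha=0.2, beta=1.0):
    """SAC-style policy term (entropy-regularized value maximization) plus a
    constraint pulling the learned policy's actions toward the planner's,
    so the value function is queried near the data distribution."""
    sac_term = (alpha * log_prob - q_value).mean()
    constraint = (action - planner_action).pow(2).sum(-1).mean()
    return sac_term + beta * constraint

# toy shapes: batch of 64, 6-dim actions
loss = policy_loss_with_constraint(torch.randn(64), torch.randn(64),
                                   torch.randn(64, 6), torch.randn(64, 6))
</code></pre>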
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.03550v1-abstract-full').style.display = 'none'; document.getElementById('2502.03550v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.02757">arXiv:2502.02757</a> <span> [<a href="https://arxiv.org/pdf/2502.02757">pdf</a>, <a href="https://arxiv.org/format/2502.02757">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Software Engineering">cs.SE</span> </div> </div> <p class="title is-5 mathjax"> Too Noisy To Learn: Enhancing Data Quality for Code Review Comment Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+C">Chunhua Liu</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+H+Y">Hong Yi Lin</a>, <a href="/search/cs?searchtype=author&query=Thongtanunam%2C+P">Patanamon Thongtanunam</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.02757v2-abstract-short" style="display: inline;"> Code review is an important practice in software development, yet it is time-consuming and requires substantial effort. While open-source datasets have been used to train neural models for automating code review tasks, including review comment generation, these datasets contain a significant amount of noisy comments (e.g., vague or non-actionable feedback) that persist despite cleaning methods usi… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.02757v2-abstract-full').style.display = 'inline'; document.getElementById('2502.02757v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.02757v2-abstract-full" style="display: none;"> Code review is an important practice in software development, yet it is time-consuming and requires substantial effort. While open-source datasets have been used to train neural models for automating code review tasks, including review comment generation, these datasets contain a significant amount of noisy comments (e.g., vague or non-actionable feedback) that persist despite cleaning methods using heuristics and machine learning approaches. Such remaining noise may lead models to generate low-quality review comments, yet removing them requires a complex semantic understanding of both code changes and natural language comments. In this paper, we investigate the impact of such noise on review comment generation and propose a novel approach using large language models (LLMs) to further clean these datasets. Based on an empirical study on a large-scale code review dataset, our LLM-based approach achieves 66-85% precision in detecting valid comments. Using the predicted valid comments to fine-tune the state-of-the-art code review models (cleaned models) can generate review comments that are 13.0% - 12.4% more similar to valid human-written comments than the original models. 
We also find that the cleaned models can generate more informative and relevant comments than the original models. Our findings underscore the critical impact of dataset quality on the performance of review comment generation. We advocate for further research into cleaning training data to enhance the practical utility and quality of automated code review. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 4 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">The paper is published at the International Conference on Mining Software Repositories (MSR2025)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.02458">arXiv:2502.02458</a> <span> [<a href="https://arxiv.org/pdf/2502.02458">pdf</a>, <a href="https://arxiv.org/format/2502.02458">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> SAISA: Towards Multimodal Large Language Models with Both Training and Inference Efficiency </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yuan%2C+Q">Qianhao Yuan</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yanjiang Liu</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+Y">Yaojie Lu</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+H">Hongyu Lin</a>, <a href="/search/cs?searchtype=author&query=He%2C+B">Ben He</a>, <a href="/search/cs?searchtype=author&query=Han%2C+X">Xianpei Han</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+L">Le Sun</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>:
<span class="abstract-full has-text-grey-dark mathjax" id="2502.02458v1-abstract-full" style="display: inline;"> Multimodal Large Language Models (MLLMs) mainly fall into two architectures, each involving a trade-off between training and inference efficiency: embedding space alignment (e.g., LLaVA-1.5) is inefficient during inference, while cross-attention space alignment (e.g., Flamingo) is inefficient in training. In this paper, we compare these two architectures and identify the key factors for building efficient MLLMs. A primary difference between them lies in how attention is applied to visual tokens, particularly in their interactions with each other. To investigate whether attention among visual tokens is necessary, we propose a new self-attention mechanism, NAAViT (\textbf{N}o \textbf{A}ttention \textbf{A}mong \textbf{Vi}sual \textbf{T}okens), which eliminates this type of attention. Our pilot experiment on LLaVA-1.5 shows that attention among visual tokens is highly redundant. Based on these insights, we introduce SAISA (\textbf{S}elf-\textbf{A}ttention \textbf{I}nput \textbf{S}pace \textbf{A}lignment), a novel architecture that enhances both training and inference efficiency. SAISA directly aligns visual features with the input spaces of NAAViT self-attention blocks, reducing computational overhead in both self-attention blocks and feed-forward networks (FFNs). Using the same configuration as LLaVA-1.5, SAISA reduces inference FLOPs by 66\% and the training budget by 26\%, while achieving superior accuracy. Comprehensive ablation studies further validate the effectiveness of SAISA across various LLMs and visual encoders. The code and model will be publicly available at https://github.com/icip-cas/SAISA. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li>
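<p class="is-size-7">The "no attention among visual tokens" idea can be expressed as an attention mask; a hedged sketch (ignoring causal masking and batching, which a real implementation would combine with this; the function name is illustrative):</p> <pre><code>
import torch

def naavit_style_mask(n_visual: int, n_text: int) -> torch.Tensor:
    """Boolean attention mask in the spirit of NAAViT: visual tokens attend
    only to themselves and to text tokens, never to other visual tokens,
    while text tokens may attend everywhere. True = attention allowed.
    Token order assumed: [visual tokens, then text tokens]."""
    n = n_visual + n_text
    mask = torch.ones(n, n, dtype=torch.bool)
    # zero out visual-to-visual attention except the diagonal
    mask[:n_visual, :n_visual] = torch.eye(n_visual, dtype=torch.bool)
    return mask

mask = naavit_style_mask(n_visual=576, n_text=128)
</code></pre>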
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.02302">arXiv:2502.02302</a> <span> [<a href="https://arxiv.org/pdf/2502.02302">pdf</a>, <a href="https://arxiv.org/format/2502.02302">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> EdgeGFL: Rethinking Edge Information in Graph Feature Preference Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhuo%2C+S">Shengda Zhuo</a>, <a href="/search/cs?searchtype=author&query=Fang%2C+J">Jiwang Fang</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+H">Hongguang Lin</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+Y">Yin Tang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+M">Min Chen</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+C">Changdong Wang</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+S">Shuqiang Huang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2502.02302v1-abstract-full" style="display: inline;"> Graph Neural Networks (GNNs) have significant advantages in handling non-Euclidean data and have been widely applied across various areas, thus receiving increasing attention in recent years. The framework of GNN models mainly includes the information propagation phase and the aggregation phase, treating nodes and edges as information entities and propagation channels, respectively. However, most existing GNN models face the challenge of disconnection between node and edge feature information, as these models typically treat the learning of edge and node features as independent tasks. To address this limitation, we aim to develop an edge-empowered graph feature preference learning framework that can capture edge embeddings to assist node embeddings. By leveraging the learned multidimensional edge feature matrix, we construct multi-channel filters to more effectively capture accurate node features, thereby obtaining the non-local structural characteristics and fine-grained high-order node features. Specifically, the inclusion of multidimensional edge information enhances the functionality and flexibility of the GNN model, enabling it to handle complex and diverse graph data more effectively.
Additionally, integrating relational representation learning into the message-passing framework allows graph nodes to receive more useful information, thereby facilitating node representation learning. Finally, experiments on four real-world heterogeneous graphs demonstrate the effectiveness of the proposed model. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.01634">arXiv:2502.01634</a> <span> [<a href="https://arxiv.org/pdf/2502.01634">pdf</a>, <a href="https://arxiv.org/format/2502.01634">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Online Gradient Boosting Decision Tree: In-Place Updates for Efficient Adding/Deleting Data </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lin%2C+H">Huawei Lin</a>, <a href="/search/cs?searchtype=author&query=Chung%2C+J+W">Jun Woo Chung</a>, <a href="/search/cs?searchtype=author&query=Lao%2C+Y">Yingjie Lao</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+W">Weijie Zhao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2502.01634v1-abstract-full" style="display: inline;"> Gradient Boosting Decision Tree (GBDT) is one of the most popular machine learning models in various applications. However, in traditional settings, all data must be simultaneously accessed during training: adding or deleting data instances after training is not supported. In this paper, we propose an efficient online learning framework for GBDT supporting both incremental and decremental learning.
To the best of our knowledge, this is the first work to consider in-place unified incremental and decremental learning on GBDT. To reduce the learning cost, we present a collection of optimizations for our framework, so that it can add or delete a small fraction of data on the fly. We theoretically show the relationship between the hyper-parameters of the proposed optimizations, which enables trading off accuracy and cost in incremental and decremental learning. The backdoor attack results show that our framework can successfully inject and remove backdoors in a well-trained model using incremental and decremental learning, and the empirical results on public datasets confirm the effectiveness and efficiency of our proposed online learning framework and optimizations. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">25 pages, 11 figures, 16 tables. Keywords: Decremental Learning, Incremental Learning, Machine Unlearning, Online Learning, Gradient Boosting Decision Trees, GBDTs</span> </p> </li>
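<p class="is-size-7">A toy illustration of why in-place add/delete is natural for tree ensembles: if each leaf keeps sufficient statistics, adding or removing a sample only touches the leaves it falls into. This is a simplification under squared loss with no regularization; a real system like the one described must also revisit split decisions, which this sketch deliberately omits.</p> <pre><code>
class LeafStats:
    """Per-leaf sufficient statistics supporting in-place add/delete."""

    def __init__(self):
        self.grad_sum = 0.0
        self.count = 0

    def add(self, grad):          # incremental learning: one sample arrives
        self.grad_sum += grad
        self.count += 1

    def delete(self, grad):       # decremental learning / unlearning
        self.grad_sum -= grad
        self.count -= 1

    @property
    def value(self):
        # leaf prediction as the mean gradient (squared loss, toy setting)
        return self.grad_sum / self.count if self.count else 0.0
</code></pre>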
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.01142">arXiv:2502.01142</a> <span> [<a href="https://arxiv.org/pdf/2502.01142">pdf</a>, <a href="https://arxiv.org/format/2502.01142">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> </div> </div> <p class="title is-5 mathjax"> DeepRAG: Thinking to Retrieval Step by Step for Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Guan%2C+X">Xinyan Guan</a>, <a href="/search/cs?searchtype=author&query=Zeng%2C+J">Jiali Zeng</a>, <a href="/search/cs?searchtype=author&query=Meng%2C+F">Fandong Meng</a>, <a href="/search/cs?searchtype=author&query=Xin%2C+C">Chunlei Xin</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+Y">Yaojie Lu</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+H">Hongyu Lin</a>, <a href="/search/cs?searchtype=author&query=Han%2C+X">Xianpei Han</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+L">Le Sun</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+J">Jie Zhou</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2502.01142v1-abstract-full" style="display: inline;"> Large Language Models (LLMs) have shown remarkable potential in reasoning, while they still suffer from severe factual hallucinations due to the limited timeliness, accuracy, and coverage of parametric knowledge. Meanwhile, integrating reasoning with retrieval-augmented generation (RAG) remains challenging due to ineffective task decomposition and redundant retrieval, which can introduce noise and degrade response quality. In this paper, we propose DeepRAG, a framework that models retrieval-augmented reasoning as a Markov Decision Process (MDP), enabling strategic and adaptive retrieval. By iteratively decomposing queries, DeepRAG dynamically determines whether to retrieve external knowledge or rely on parametric reasoning at each step. Experiments show that DeepRAG improves retrieval efficiency while improving answer accuracy by 21.99%, demonstrating its effectiveness in optimizing retrieval-augmented reasoning. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li>
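<p class="is-size-7">The retrieve-or-reason control flow the abstract describes can be sketched as a per-step decision loop; everything here (the callables <code>decompose</code>, <code>should_retrieve</code>, <code>retrieve</code>, <code>answer_step</code>, <code>synthesize</code>) is a hypothetical stand-in for DeepRAG's learned components:</p> <pre><code>
def adaptive_rag(question, decompose, should_retrieve, retrieve,
                 answer_step, synthesize):
    """Iteratively decompose a question; at each step the policy chooses the
    MDP 'action': external retrieval (costly but grounded) or the model's
    own parametric knowledge (cheap but may hallucinate)."""
    context = []
    for sub_q in decompose(question):
        if should_retrieve(sub_q, context):
            evidence = retrieve(sub_q)       # consult external knowledge
        else:
            evidence = answer_step(sub_q)    # rely on parametric reasoning
        context.append((sub_q, evidence))
    return synthesize(question, context)
</code></pre>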
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.18795">arXiv:2501.18795</a> <span> [<a href="https://arxiv.org/pdf/2501.18795">pdf</a>, <a href="https://arxiv.org/format/2501.18795">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Rope to Nope and Back Again: A New Hybrid Attention Strategy </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yang%2C+B">Bowen Yang</a>, <a href="/search/cs?searchtype=author&query=Venkitesh%2C+B">Bharat Venkitesh</a>, <a href="/search/cs?searchtype=author&query=Talupuru%2C+D">Dwarak Talupuru</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+H">Hangyu Lin</a>, <a href="/search/cs?searchtype=author&query=Cairuz%2C+D">David Cairuz</a>, <a href="/search/cs?searchtype=author&query=Blunsom%2C+P">Phil Blunsom</a>, <a href="/search/cs?searchtype=author&query=Locatelli%2C+A">Acyr Locatelli</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2501.18795v1-abstract-full" style="display: inline;"> Long-context large language models (LLMs) have achieved remarkable advancements, driven by techniques like Rotary Position Embedding (RoPE) (Su et al., 2023) and its extensions (Chen et al., 2023; Liu et al., 2024c; Peng et al., 2023). By adjusting RoPE parameters and incorporating training data with extended contexts, we can train performant models with considerably longer input sequences. However, existing RoPE-based methods exhibit performance limitations when applied to extended context lengths. This paper presents a comprehensive analysis of various attention mechanisms, including RoPE, No Positional Embedding (NoPE), and Query-Key Normalization (QK-Norm), identifying their strengths and shortcomings in long-context modeling. Our investigation identifies distinctive attention patterns in these methods and highlights their impact on long-context performance, providing valuable insights for architectural design. Building on these findings, we propose a novel architecture based on a hybrid attention mechanism that not only surpasses conventional RoPE-based transformer models in long-context tasks but also achieves competitive performance on benchmarks requiring shorter context lengths. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li>
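<p class="is-size-7">For readers unfamiliar with one of the compared mechanisms, here is a common formulation of Query-Key Normalization: L2-normalize queries and keys before the dot product so attention logits become bounded cosine similarities. The fixed <code>scale</code> stands in for what is typically a learnable temperature; this is a generic sketch, not this paper's exact variant.</p> <pre><code>
import torch
import torch.nn.functional as F

def qk_norm_attention(q, k, v, scale: float = 10.0):
    """QK-Norm attention. Shapes: (batch, heads, seq, head_dim)."""
    q = F.normalize(q, dim=-1)                     # unit-norm queries
    k = F.normalize(k, dim=-1)                     # unit-norm keys
    logits = scale * (q @ k.transpose(-2, -1))     # bounded cosine logits
    return torch.softmax(logits, dim=-1) @ v
</code></pre>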
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.17183">arXiv:2501.17183</a> <span> [<a href="https://arxiv.org/pdf/2501.17183">pdf</a>, <a href="https://arxiv.org/format/2501.17183">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> LLM Evaluation Based on Aerospace Manufacturing Expertise: Automated Generation and Multi-Model Question Answering </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+B">Beiming Liu</a>, <a href="/search/cs?searchtype=author&query=Cui%2C+Z">Zhizhuo Cui</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+S">Siteng Hu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+X">Xiaohua Li</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+H">Haifeng Lin</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Z">Zhengxin Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2501.17183v2-abstract-full" style="display: inline;"> Aerospace manufacturing demands exceptionally high precision in technical parameters. The remarkable performance of Large Language Models (LLMs), such as GPT-4 and QWen, in Natural Language Processing has sparked industry interest in their application to tasks including process design, material selection, and tool information retrieval. However, LLMs are prone to generating "hallucinations" in specialized domains, producing inaccurate or false information that poses significant risks to the quality of aerospace products and flight safety. This paper introduces a set of evaluation metrics tailored for LLMs in aerospace manufacturing, aiming to assess their accuracy by analyzing their performance in answering questions grounded in professional knowledge. First, key information is extracted through in-depth textual analysis of classic aerospace manufacturing textbooks and guidelines. Subsequently, utilizing LLM generation techniques, we meticulously construct multiple-choice questions with multiple correct answers of varying difficulty. Following this, different LLMs are employed to answer these questions, and their accuracy is recorded. Experimental results demonstrate that the capabilities of LLMs in aerospace professional knowledge are in urgent need of improvement.
This study provides a theoretical foundation and practical guidance for the application of LLMs in aerospace manufacturing, addressing a critical gap in the field. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 25 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">conference paper</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">MSC Class:</span> 68T50; 90B30 <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> I.2.7; J.2 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.15968">arXiv:2501.15968</a> <span> [<a href="https://arxiv.org/pdf/2501.15968">pdf</a>, <a href="https://arxiv.org/format/2501.15968">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Multi-View Attention Syntactic Enhanced Graph Convolutional Network for Aspect-based Sentiment Analysis </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Huang%2C+X">Xiang Huang</a>, <a href="/search/cs?searchtype=author&query=Peng%2C+H">Hao Peng</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+S">Shuo Sun</a>, <a href="/search/cs?searchtype=author&query=Hao%2C+Z">Zhifeng Hao</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+H">Hui Lin</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+S">Shuhai Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>:
Recently, incorporating graph neural networks (GNNs) to capture additional syntactic structure information in the dependency tree derived from syntactic dependency parsing has been proven to be an effective paradigm for boosting ABSA. Despite GNNs enhancing model capability by fusing more types of information, most works only utilize a single topology view of the dependency tree or simply conflate different perspectives of information without distinction, which limits the model performance. To address these challenges, in this paper, we propose a new multi-view attention syntactic enhanced graph convolutional network (MASGCN) that weighs different syntactic information of views using attention mechanisms. Specifically, we first construct distance mask matrices from the dependency tree to obtain multiple subgraph views for GNNs. To aggregate features from different views, we propose a multi-view attention mechanism to calculate the attention weights of views. Furthermore, to incorporate more syntactic information, we fuse the dependency type information matrix into the adjacency matrices and present a structural entropy loss to learn the dependency type adjacency matrix. Comprehensive experiments on four benchmark datasets demonstrate that our model outperforms state-of-the-art methods. The codes and datasets are available at https://github.com/SELGroup/MASGCN. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.15968v1-abstract-full').style.display = 'none'; document.getElementById('2501.15968v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. 
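<p class="is-size-7">A minimal PyTorch sketch of the view-weighting idea the abstract describes: one GCN-style transform is applied per distance-masked adjacency view, and an attention score over views fuses the results. Layer sizes, the scoring function, and the toy inputs are assumptions, not the authors' MASGCN code (see the repository linked above for that).</p>
<pre><code class="language-python">
import torch
import torch.nn as nn

class MultiViewGCNLayer(nn.Module):
    def __init__(self, dim):
        super().__init__()
        self.lin = nn.Linear(dim, dim)    # shared node transform
        self.score = nn.Linear(dim, 1)    # attention score per view

    def forward(self, x, adj_views):      # x: (n, d); adj_views: (k, n, n)
        # one message-passing step per subgraph view
        outs = torch.stack([torch.relu(a @ self.lin(x)) for a in adj_views])
        # attention weights over the k views, from pooled view features
        w = torch.softmax(self.score(outs.mean(dim=1)), dim=0)   # (k, 1)
        return (w.unsqueeze(1) * outs).sum(dim=0)                # fused (n, d)

x = torch.randn(5, 16)
# stand-ins for adjacencies masked at dependency distances 1, 2, 3
views = torch.stack([torch.eye(5)] * 3)
print(MultiViewGCNLayer(16)(x, views).shape)  # torch.Size([5, 16])
</code></pre>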
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">This paper is accepted by DASFAA 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.15451">arXiv:2501.15451</a> <span> [<a href="https://arxiv.org/pdf/2501.15451">pdf</a>, <a href="https://arxiv.org/format/2501.15451">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> STATE ToxiCN: A Benchmark for Span-level Target-Aware Toxicity Extraction in Chinese Hate Speech Detection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Bai%2C+Z">Zewen Bai</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+Y">Yuanyuan Sun</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+S">Shengdi Yin</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+J">Junyu Lu</a>, <a href="/search/cs?searchtype=author&query=Zeng%2C+J">Jingjie Zeng</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+H">Haohao Zhu</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+L">Liang Yang</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+H">Hongfei Lin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.15451v2-abstract-short" style="display: inline;"> The proliferation of hate speech has caused significant harm to society. The intensity and directionality of hate are closely tied to the target and argument it is associated with. However, research on hate speech detection in Chinese has lagged behind, and existing datasets lack span-level fine-grained annotations. Furthermore, the lack of research on Chinese hateful slang poses a significant cha… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.15451v2-abstract-full').style.display = 'inline'; document.getElementById('2501.15451v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.15451v2-abstract-full" style="display: none;"> The proliferation of hate speech has caused significant harm to society. The intensity and directionality of hate are closely tied to the target and argument it is associated with. However, research on hate speech detection in Chinese has lagged behind, and existing datasets lack span-level fine-grained annotations. Furthermore, the lack of research on Chinese hateful slang poses a significant challenge. In this paper, we provide a solution for fine-grained detection of Chinese hate speech. First, we construct a dataset containing Target-Argument-Hateful-Group quadruples (STATE ToxiCN), which is the first span-level Chinese hate speech dataset. Secondly, we evaluate the span-level hate speech detection performance of existing models using STATE ToxiCN. Finally, we conduct the first study on Chinese hateful slang and evaluate the ability of LLMs to detect such expressions. Our work contributes valuable resources and insights to advance span-level hate speech detection in Chinese. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.15451v2-abstract-full').style.display = 'none'; document.getElementById('2501.15451v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 26 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.14302">arXiv:2501.14302</a> <span> [<a href="https://arxiv.org/pdf/2501.14302">pdf</a>, <a href="https://arxiv.org/format/2501.14302">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> TD-RD: A Top-Down Benchmark with Real-Time Framework for Road Damage Detection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xiao%2C+X">Xi Xiao</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zhengji Li</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+W">Wentao Wang</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+J">Jiacheng Xie</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+H">Houjie Lin</a>, <a href="/search/cs?searchtype=author&query=Roy%2C+S+K">Swalpa Kumar Roy</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+T">Tianyang Wang</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+M">Min Xu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.14302v1-abstract-short" style="display: inline;"> Object detection has witnessed remarkable advancements over the past decade, largely driven by breakthroughs in deep learning and the proliferation of large scale datasets. However, the domain of road damage detection remains relatively under explored, despite its critical significance for applications such as infrastructure maintenance and road safety. This paper addresses this gap by introducing… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.14302v1-abstract-full').style.display = 'inline'; document.getElementById('2501.14302v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.14302v1-abstract-full" style="display: none;"> Object detection has witnessed remarkable advancements over the past decade, largely driven by breakthroughs in deep learning and the proliferation of large scale datasets. However, the domain of road damage detection remains relatively under explored, despite its critical significance for applications such as infrastructure maintenance and road safety. This paper addresses this gap by introducing a novel top down benchmark that offers a complementary perspective to existing datasets, specifically tailored for road damage detection. Our proposed Top Down Road Damage Detection Dataset (TDRD) includes three primary categories of road damage cracks, potholes, and patches captured from a top down viewpoint. 
<li class="arxiv-result">
<p class="list-title"><a href="https://arxiv.org/abs/2501.14216">arXiv:2501.14216</a> [<a href="https://arxiv.org/pdf/2501.14216">pdf</a>, <a href="https://arxiv.org/format/2501.14216">other</a>]</p>
<div class="tags"> <span class="tag">cs.LG (Machine Learning)</span> <span class="tag">cs.AI (Artificial Intelligence)</span> <span class="tag">cs.CE (Computational Engineering, Finance, and Science)</span> </div>
<p class="title is-5 mathjax"> TFG-Flow: Training-free Guidance in Multimodal Generative Flow </p>
<p class="authors"> <strong>Authors:</strong> Haowei Lin, Shanda Li, Haotian Ye, Yiming Yang, Stefano Ermon, Yitao Liang, Jianzhu Ma </p>
<p class="abstract mathjax"> <strong>Abstract:</strong> Given an unconditional generative model and a predictor for a target property (e.g., a classifier), the goal of training-free guidance is to generate samples with desirable target properties without additional training. As a highly efficient technique for steering generative models toward flexible outcomes, training-free guidance has gained increasing attention in diffusion models. However, existing methods only handle data in continuous spaces, while many scientific applications involve both continuous and discrete data (referred to as multimodality). Another emerging trend is the growing use of the simple and general flow matching framework in building generative foundation models, where guided generation remains under-explored. To address this, we introduce TFG-Flow, a novel training-free guidance method for multimodal generative flow. TFG-Flow addresses the curse of dimensionality while maintaining the property of unbiased sampling when guiding discrete variables. We validate TFG-Flow on four molecular design tasks and show that TFG-Flow has great potential in drug design by generating molecules with desired properties. </p>
<p class="is-size-7"><strong>Submitted</strong> 23 January, 2025; <strong>originally announced</strong> January 2025. </p>
<p class="comments is-size-7"> <strong>Journal ref:</strong> ICLR 2025 </p>
</li>
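<p class="is-size-7">For intuition, the sketch below shows the generic idea behind training-free guidance in flow matching for continuous variables only: each Euler step of the learned velocity field is nudged by the gradient of a property predictor. The velocity field, predictor, and step sizes are toy stand-ins; TFG-Flow itself also guides discrete variables, which this sketch does not attempt.</p>
<pre><code class="language-python">
import torch

def guided_euler_step(x, t, velocity, log_prob, dt=0.01, scale=1.0):
    # gradient of the predictor's log-probability steers the flow
    x = x.detach().requires_grad_(True)
    g = torch.autograd.grad(log_prob(x).sum(), x)[0]
    return (x + dt * (velocity(x, t) + scale * g)).detach()

velocity = lambda x, t: -x                       # toy "pretrained" field
log_prob = lambda x: -((x - 2.0) ** 2).sum(-1)   # toy property: pull x toward 2
x = torch.randn(4, 3)
for i in range(100):
    x = guided_euler_step(x, i / 100, velocity, log_prob)
print(x.mean().item())  # drifts toward the guided target
</code></pre>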
<li class="arxiv-result">
<p class="list-title"><a href="https://arxiv.org/abs/2501.12393">arXiv:2501.12393</a> [<a href="https://arxiv.org/pdf/2501.12393">pdf</a>, <a href="https://arxiv.org/format/2501.12393">other</a>]</p>
<div class="tags"> <span class="tag">cs.CV (Computer Vision and Pattern Recognition)</span> </div>
<p class="title is-5 mathjax"> Towards Affordance-Aware Articulation Synthesis for Rigged Objects </p>
<p class="authors"> <strong>Authors:</strong> Yu-Chu Yu, Chieh Hubert Lin, Hsin-Ying Lee, Chaoyang Wang, Yu-Chiang Frank Wang, Ming-Hsuan Yang </p>
<p class="abstract mathjax"> <strong>Abstract:</strong> Rigged objects are commonly used in artist pipelines, as they can flexibly adapt to different scenes and postures. However, articulating the rigs into realistic affordance-aware postures (e.g., following the context, respecting the physics and the personalities of the object) remains time-consuming and relies heavily on human labor from experienced artists. In this paper, we tackle this novel problem and design A3Syn. Given a context, such as an environment mesh and a text prompt describing the desired posture, A3Syn synthesizes articulation parameters for arbitrary, open-domain rigged objects obtained from the Internet. The task is highly challenging due to the lack of training data, and we make no topological assumptions about the open-domain rigs. We propose using a 2D inpainting diffusion model and several control techniques to synthesize in-context affordance information. We then develop an efficient bone correspondence alignment using a combination of differentiable rendering and semantic correspondence. A3Syn converges stably, completes in minutes, and synthesizes plausible affordances for different combinations of in-the-wild object rigs and scenes. </p>
<p class="is-size-7"><strong>Submitted</strong> 21 January, 2025; <strong>originally announced</strong> January 2025. </p>
<p class="comments is-size-7"> <strong>Comments:</strong> Project page: https://chuyu.org/research/a3syn </p>
</li>
<li class="arxiv-result">
<p class="list-title"><a href="https://arxiv.org/abs/2501.10870">arXiv:2501.10870</a> [<a href="https://arxiv.org/pdf/2501.10870">pdf</a>, <a href="https://arxiv.org/format/2501.10870">other</a>]</p>
<div class="tags"> <span class="tag">stat.ML (Machine Learning)</span> <span class="tag">cs.LG (Machine Learning)</span> </div>
<p class="title is-5 mathjax"> Model-Robust and Adaptive-Optimal Transfer Learning for Tackling Concept Shifts in Nonparametric Regression </p>
<p class="authors"> <strong>Authors:</strong> Haotian Lin, Matthew Reimherr </p>
<p class="abstract mathjax"> <strong>Abstract:</strong> When concept shifts and sample scarcity are present in the target domain of interest, nonparametric regression learners often struggle to generalize effectively. Transfer learning remedies these issues by leveraging data or pre-trained models from similar source domains. While existing generalization analyses of kernel-based transfer learning typically rely on correctly specified models, we present a transfer learning procedure that is robust against model misspecification while adaptively attaining optimality. To facilitate our analysis and avoid the risk of saturation found in classical misspecified results, we establish a novel result in the misspecified single-task learning setting, showing that spectral algorithms with fixed-bandwidth Gaussian kernels can attain minimax convergence rates when the true function lies in a Sobolev space, which may be of independent interest. Building on this, we derive adaptive convergence rates of the excess risk for specifying Gaussian kernels in a prevalent class of hypothesis transfer learning algorithms. Our results are minimax optimal up to logarithmic factors and elucidate the key determinants of transfer efficiency. </p>
<p class="is-size-7"><strong>Submitted</strong> 18 January, 2025; <strong>originally announced</strong> January 2025. </p>
</li>
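<p class="is-size-7">The sketch below shows one common hypothesis-transfer pattern consistent with the class of algorithms the abstract analyzes: fit a fixed-bandwidth Gaussian-kernel ridge regressor on the source data, then fit a second one to the target residuals (the offset). The estimator, synthetic data, bandwidth, and regularization are illustrative assumptions, not the paper's procedure.</p>
<pre><code class="language-python">
import numpy as np

def gauss_K(A, B, bw=1.0):
    # fixed-bandwidth Gaussian kernel matrix between row sets A and B
    d2 = ((A[:, None, :] - B[None, :, :]) ** 2).sum(-1)
    return np.exp(-d2 / (2 * bw**2))

def krr_fit(X, y, lam=1e-2, bw=1.0):
    alpha = np.linalg.solve(gauss_K(X, X, bw) + lam * np.eye(len(X)), y)
    return lambda Z: gauss_K(Z, X, bw) @ alpha

rng = np.random.default_rng(0)
Xs, Xt = rng.uniform(-3, 3, (200, 1)), rng.uniform(-3, 3, (20, 1))
ys = np.sin(Xs).ravel()                      # source concept
yt = np.sin(Xt).ravel() + 0.5 * Xt.ravel()   # shifted target concept
f_src = krr_fit(Xs, ys)
f_off = krr_fit(Xt, yt - f_src(Xt))          # learn only the concept shift
f_transfer = lambda Z: f_src(Z) + f_off(Z)
print(np.abs(f_transfer(Xt) - yt).mean())    # small residual on target
</code></pre>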
<li class="arxiv-result">
<p class="list-title"><a href="https://arxiv.org/abs/2501.05901">arXiv:2501.05901</a> [<a href="https://arxiv.org/pdf/2501.05901">pdf</a>, <a href="https://arxiv.org/format/2501.05901">other</a>]</p>
<div class="tags"> <span class="tag">cs.CV (Computer Vision and Pattern Recognition)</span> </div>
<p class="title is-5 mathjax"> Valley2: Exploring Multimodal Models with Scalable Vision-Language Design </p>
<p class="authors"> <strong>Authors:</strong> Ziheng Wu, Zhenghao Chen, Ruipu Luo, Can Zhang, Yuan Gao, Zhentao He, Xian Wang, Haoran Lin, Minghui Qiu </p>
<p class="abstract mathjax"> <strong>Abstract:</strong> Recently, vision-language models have made remarkable progress, demonstrating outstanding capabilities in various tasks such as image captioning and video understanding. We introduce Valley2, a novel multimodal large language model designed to enhance performance across all domains and extend the boundaries of practical applications in e-commerce and short-video scenarios. Notably, Valley2 achieves state-of-the-art (SOTA) performance on e-commerce benchmarks, surpassing open-source models of similar size by a large margin (79.66 vs. 72.76). Additionally, Valley2 ranks second on the OpenCompass leaderboard among models with fewer than 10B parameters, with an average score of 67.4. The code and model weights are open-sourced at https://github.com/bytedance/Valley. </p>
<p class="is-size-7"><strong>Submitted</strong> 12 January, 2025; <strong>v1</strong> submitted 10 January, 2025; <strong>originally announced</strong> January 2025. </p>
</li>
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.05901v2-abstract-full').style.display = 'none'; document.getElementById('2501.05901v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 10 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.05663">arXiv:2501.05663</a> <span> [<a href="https://arxiv.org/pdf/2501.05663">pdf</a>, <a href="https://arxiv.org/format/2501.05663">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Quantum Physics">quant-ph</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Emerging Technologies">cs.ET</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Neural and Evolutionary Computing">cs.NE</span> </div> </div> <p class="title is-5 mathjax"> Learning to Measure Quantum Neural Networks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+S+Y">Samuel Yen-Chi Chen</a>, <a href="/search/cs?searchtype=author&query=Tseng%2C+H">Huan-Hsin Tseng</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+H">Hsin-Yi Lin</a>, <a href="/search/cs?searchtype=author&query=Yoo%2C+S">Shinjae Yoo</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.05663v1-abstract-short" style="display: inline;"> The rapid progress in quantum computing (QC) and machine learning (ML) has attracted growing attention, prompting extensive research into quantum machine learning (QML) algorithms to solve diverse and complex problems. Designing high-performance QML models demands expert-level proficiency, which remains a significant obstacle to the broader adoption of QML. A few major hurdles include crafting eff… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.05663v1-abstract-full').style.display = 'inline'; document.getElementById('2501.05663v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.05663v1-abstract-full" style="display: none;"> The rapid progress in quantum computing (QC) and machine learning (ML) has attracted growing attention, prompting extensive research into quantum machine learning (QML) algorithms to solve diverse and complex problems. Designing high-performance QML models demands expert-level proficiency, which remains a significant obstacle to the broader adoption of QML. A few major hurdles include crafting effective data encoding techniques and parameterized quantum circuits, both of which are crucial to the performance of QML models. 
<li class="arxiv-result">
<p class="list-title"><a href="https://arxiv.org/abs/2501.03936">arXiv:2501.03936</a> [<a href="https://arxiv.org/pdf/2501.03936">pdf</a>, <a href="https://arxiv.org/format/2501.03936">other</a>]</p>
<div class="tags"> <span class="tag">cs.AI (Artificial Intelligence)</span> <span class="tag">cs.CL (Computation and Language)</span> </div>
<p class="title is-5 mathjax"> PPTAgent: Generating and Evaluating Presentations Beyond Text-to-Slides </p>
<p class="authors"> <strong>Authors:</strong> Hao Zheng, Xinyan Guan, Hao Kong, Jia Zheng, Hongyu Lin, Yaojie Lu, Ben He, Xianpei Han, Le Sun </p>
<p class="abstract mathjax"> <strong>Abstract:</strong> Automatically generating presentations from documents is a challenging task that requires balancing content quality, visual design, and structural coherence. Existing methods primarily focus on improving and evaluating content quality in isolation, often overlooking visual design and structural coherence, which limits their practical applicability. To address these limitations, we propose PPTAgent, which comprehensively improves presentation generation through a two-stage, edit-based approach inspired by human workflows. PPTAgent first analyzes reference presentations to understand their structural patterns and content schemas, then drafts outlines and generates slides through code actions to ensure consistency and alignment. To comprehensively evaluate the quality of generated presentations, we further introduce PPTEval, an evaluation framework that assesses presentations across three dimensions: Content, Design, and Coherence. Experiments show that PPTAgent significantly outperforms traditional automatic presentation generation methods across all three dimensions. The code and data are available at https://github.com/icip-cas/PPTAgent. </p>
<p class="is-size-7"><strong>Submitted</strong> 7 January, 2025; <strong>originally announced</strong> January 2025. </p>
<p class="comments is-size-7"> <strong>Comments:</strong> 8 pages, 20 figures </p>
</li>
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">8 pages, 20 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.02474">arXiv:2501.02474</a> <span> [<a href="https://arxiv.org/pdf/2501.02474">pdf</a>, <a href="https://arxiv.org/format/2501.02474">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Generalization-Enhanced Few-Shot Object Detection in Remote Sensing </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lin%2C+H">Hui Lin</a>, <a href="/search/cs?searchtype=author&query=Li%2C+N">Nan Li</a>, <a href="/search/cs?searchtype=author&query=Yao%2C+P">Pengjuan Yao</a>, <a href="/search/cs?searchtype=author&query=Dong%2C+K">Kexin Dong</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+Y">Yuhan Guo</a>, <a href="/search/cs?searchtype=author&query=Hong%2C+D">Danfeng Hong</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Ying Zhang</a>, <a href="/search/cs?searchtype=author&query=Wen%2C+C">Congcong Wen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.02474v1-abstract-short" style="display: inline;"> Remote sensing object detection is particularly challenging due to the high resolution, multi-scale features, and diverse ground object characteristics inherent in satellite and UAV imagery. These challenges necessitate more advanced approaches for effective object detection in such environments. While deep learning methods have achieved remarkable success in remote sensing object detection, they… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.02474v1-abstract-full').style.display = 'inline'; document.getElementById('2501.02474v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.02474v1-abstract-full" style="display: none;"> Remote sensing object detection is particularly challenging due to the high resolution, multi-scale features, and diverse ground object characteristics inherent in satellite and UAV imagery. These challenges necessitate more advanced approaches for effective object detection in such environments. While deep learning methods have achieved remarkable success in remote sensing object detection, they typically rely on large amounts of labeled data. Acquiring sufficient labeled data, particularly for novel or rare objects, is both challenging and time-consuming in remote sensing scenarios, limiting the generalization capabilities of existing models. To address these challenges, few-shot learning (FSL) has emerged as a promising approach, aiming to enable models to learn new classes from limited labeled examples. Building on this concept, few-shot object detection (FSOD) specifically targets object detection challenges in data-limited conditions. However, the generalization capability of FSOD models, particularly in remote sensing, is often constrained by the complex and diverse characteristics of the objects present in such environments. 
<li class="arxiv-result">
<p class="list-title"><a href="https://arxiv.org/abs/2501.02461">arXiv:2501.02461</a> [<a href="https://arxiv.org/pdf/2501.02461">pdf</a>, <a href="https://arxiv.org/format/2501.02461">other</a>]</p>
<div class="tags"> <span class="tag">cs.CV (Computer Vision and Pattern Recognition)</span> <span class="tag">cs.AI (Artificial Intelligence)</span> </div>
<p class="title is-5 mathjax"> FedRSClip: Federated Learning for Remote Sensing Scene Classification Using Vision-Language Models </p>
<p class="authors"> <strong>Authors:</strong> Hui Lin, Chao Zhang, Danfeng Hong, Kexin Dong, Congcong Wen </p>
<p class="abstract mathjax"> <strong>Abstract:</strong> Remote sensing data is often distributed across multiple institutions, and due to privacy concerns and data-sharing restrictions, leveraging large-scale datasets in a centralized training framework is challenging. Federated learning offers a promising solution by enabling collaborative model training across distributed data sources without requiring data centralization. However, current Vision-Language Models (VLMs), which typically contain billions of parameters, pose significant communication challenges for traditional federated learning approaches based on model parameter updates, as they would incur substantial communication costs. In this paper, we propose FedRSCLIP, the first federated learning framework designed for remote sensing image classification based on a VLM, specifically CLIP. FedRSCLIP addresses the challenges of data heterogeneity and large-scale model transmission in federated environments by introducing Prompt Learning, which optimizes only a small set of tunable parameters. The framework introduces a dual-prompt mechanism, comprising Shared Prompts for global knowledge sharing and Private Prompts for client-specific adaptation. To maintain semantic coherence between shared and private prompts, we propose the Dual Prompt Alignment Constraint to balance global consistency and local adaptability across diverse client distributions. Additionally, to enhance cross-modal representation learning, we introduce the Cross-Modal Feature Alignment Constraint to align multimodal features between text and image prompts. To validate the effectiveness of our proposed model, we construct a Fed-RSIC dataset based on three existing remote sensing image classification datasets, specifically designed to simulate various federated learning configurations. Experimental results demonstrate the effectiveness and superiority of FedRSCLIP in remote sensing image classification. </p>
<p class="is-size-7"><strong>Submitted</strong> 5 January, 2025; <strong>originally announced</strong> January 2025. </p>
</li>
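<p class="is-size-7">A minimal sketch of the communication pattern the abstract describes: each client tunes a small shared prompt plus a private prompt, and only the shared prompt vectors are averaged on the server, so the frozen VLM backbone is never transmitted. The prompt shapes, update rule, and random gradients below are stand-in assumptions.</p>
<pre><code class="language-python">
import numpy as np

def local_update(shared, private, lr=0.1):
    # stand-in for prompt tuning against the frozen VLM on local data
    grad_s = np.random.randn(*shared.shape)
    grad_p = np.random.randn(*private.shape)
    return shared - lr * grad_s, private - lr * grad_p

n_clients, prompt_len, dim = 3, 8, 512
shared = np.zeros((prompt_len, dim))                 # global shared prompt
privates = [np.zeros((prompt_len, dim)) for _ in range(n_clients)]

for round_ in range(5):
    updates = []
    for c in range(n_clients):
        s_c, privates[c] = local_update(shared, privates[c])
        updates.append(s_c)                          # communicate prompts only
    shared = np.mean(updates, axis=0)                # FedAvg over shared prompts
print(shared.shape)                                  # (8, 512): all that moves
</code></pre>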
<li class="arxiv-result">
<p class="list-title"><a href="https://arxiv.org/abs/2501.01957">arXiv:2501.01957</a> [<a href="https://arxiv.org/pdf/2501.01957">pdf</a>, <a href="https://arxiv.org/format/2501.01957">other</a>]</p>
<div class="tags"> <span class="tag">cs.CV (Computer Vision and Pattern Recognition)</span> <span class="tag">cs.SD (Sound)</span> <span class="tag">eess.AS (Audio and Speech Processing)</span> </div>
<p class="title is-5 mathjax"> VITA-1.5: Towards GPT-4o Level Real-Time Vision and Speech Interaction </p>
<p class="authors"> <strong>Authors:</strong> Chaoyou Fu, Haojia Lin, Xiong Wang, Yi-Fan Zhang, Yunhang Shen, Xiaoyu Liu, Haoyu Cao, Zuwei Long, Heting Gao, Ke Li, Long Ma, Xiawu Zheng, Rongrong Ji, Xing Sun, Caifeng Shan, Ran He </p>
<p class="abstract mathjax"> <strong>Abstract:</strong> Recent Multimodal Large Language Models (MLLMs) have typically focused on integrating visual and textual modalities, with less emphasis placed on the role of speech in enhancing interaction. However, speech plays a crucial role in multimodal dialogue systems, and achieving high performance in both vision and speech tasks remains a significant challenge due to fundamental modality differences. In this paper, we propose a carefully designed multi-stage training methodology that progressively trains the LLM to understand both visual and speech information, ultimately enabling fluent vision and speech interaction. Our approach not only preserves strong vision-language capability but also enables efficient speech-to-speech dialogue without separate ASR and TTS modules, significantly accelerating multimodal end-to-end response speed. By comparing our method against state-of-the-art counterparts across benchmarks for image, video, and speech tasks, we demonstrate that our model is equipped with both strong visual and speech capabilities, enabling near real-time vision and speech interaction. </p>
<p class="is-size-7"><strong>Submitted</strong> 21 January, 2025; <strong>v1</strong> submitted 3 January, 2025; <strong>originally announced</strong> January 2025. </p>
<p class="comments is-size-7"> <strong>Comments:</strong> https://github.com/VITA-MLLM/VITA (2K+ Stars by now) </p>
</li>
<li class="arxiv-result">
<p class="list-title"><a href="https://arxiv.org/abs/2501.01835">arXiv:2501.01835</a> [<a href="https://arxiv.org/pdf/2501.01835">pdf</a>, <a href="https://arxiv.org/format/2501.01835">other</a>]</p>
<div class="tags"> <span class="tag">cs.AI (Artificial Intelligence)</span> </div>
<p class="title is-5 mathjax"> ASKCOS: an open source software suite for synthesis planning </p>
<p class="authors"> <strong>Authors:</strong> Zhengkai Tu, Sourabh J. Choure, Mun Hong Fong, Jihye Roh, Itai Levin, Kevin Yu, Joonyoung F. Joung, Nathan Morgan, Shih-Cheng Li, Xiaoqi Sun, Huiqian Lin, Mark Murnin, Jordan P. Liles, Thomas J. Struble, Michael E. Fortunato, Mengjie Liu, William H. Green, Klavs F. Jensen, Connor W. Coley </p>
<p class="abstract mathjax"> <strong>Abstract:</strong> The advancement of machine learning and the availability of large-scale reaction datasets have accelerated the development of data-driven models for computer-aided synthesis planning (CASP) in the past decade. Here, we detail the newest version of ASKCOS, an open source software suite for synthesis planning that packages several research advances into a freely available, practical tool. Four one-step retrosynthesis models form the basis of both interactive planning and automatic planning modes. Retrosynthetic planning is complemented by other modules for feasibility assessment and pathway evaluation, including reaction condition recommendation, reaction outcome prediction, and auxiliary capabilities such as solubility prediction and quantum mechanical descriptor prediction. ASKCOS has assisted hundreds of medicinal, synthetic, and process chemists in their day-to-day tasks, complementing expert decision making. It is our belief that CASP tools like ASKCOS are an important part of modern chemistry research, and that they offer ever-increasing utility and accessibility. </p>
<p class="is-size-7"><strong>Submitted</strong> 3 January, 2025; <strong>originally announced</strong> January 2025. </p>
</li>
<li class="arxiv-result">
<p class="list-title"><a href="https://arxiv.org/abs/2501.01830">arXiv:2501.01830</a> [<a href="https://arxiv.org/pdf/2501.01830">pdf</a>, <a href="https://arxiv.org/format/2501.01830">other</a>]</p>
<div class="tags"> <span class="tag">cs.CR (Cryptography and Security)</span> <span class="tag">cs.AI (Artificial Intelligence)</span> <span class="tag">cs.CL (Computation and Language)</span> </div>
<p class="title is-5 mathjax"> Auto-RT: Automatic Jailbreak Strategy Exploration for Red-Teaming Large Language Models </p>
<p class="authors"> <strong>Authors:</strong> Yanjiang Liu, Shuhen Zhou, Yaojie Lu, Huijia Zhu, Weiqiang Wang, Hongyu Lin, Ben He, Xianpei Han, Le Sun </p>
<p class="abstract mathjax"> <strong>Abstract:</strong> Automated red-teaming has become a crucial approach for uncovering vulnerabilities in large language models (LLMs). However, most existing methods focus on isolated safety flaws, limiting their ability to adapt to dynamic defenses and uncover complex vulnerabilities efficiently. To address this challenge, we propose Auto-RT, a reinforcement learning framework that automatically explores and optimizes complex attack strategies to effectively uncover security vulnerabilities through malicious queries. Specifically, we introduce two key mechanisms to reduce exploration complexity and improve strategy optimization: 1) Early-terminated Exploration, which accelerates exploration by focusing on high-potential attack strategies; and 2) a Progressive Reward Tracking algorithm with intermediate downgrade models, which dynamically refines the search trajectory toward successful vulnerability exploitation. Extensive experiments across diverse LLMs demonstrate that, by significantly improving exploration efficiency and automatically optimizing attack strategies, Auto-RT detects a broader range of vulnerabilities, achieving faster detection and a 16.63% higher success rate than existing methods. </p>
<p class="is-size-7"><strong>Submitted</strong> 3 January, 2025; <strong>originally announced</strong> January 2025. </p>
</li>
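<p class="is-size-7">A very rough sketch of an exploration loop with early termination, in the spirit of mechanism 1) above. The judge and strategy names are random stubs, and Auto-RT's actual RL machinery (progressive reward tracking, downgrade models) is not reproduced here.</p>
<pre><code class="language-python">
import random

random.seed(0)

def judge(strategy):
    return random.random()   # stand-in for an attack-success reward model

def explore(n_strategies=20, probes=3, keep_threshold=0.5, top_k=5):
    results = {}
    for s in (f"strategy-{i}" for i in range(n_strategies)):
        scores = []
        for _ in range(probes):
            scores.append(judge(s))
            if len(scores) == 2 and keep_threshold > max(scores):
                break          # early termination: drop low-potential strategies
        results[s] = max(scores)
    # spend the remaining budget only on the most promising strategies
    return sorted(results.items(), key=lambda kv: -kv[1])[:top_k]

for name, score in explore():
    print(name, round(score, 2))
</code></pre>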
<li class="arxiv-result">
<p class="list-title"><a href="https://arxiv.org/abs/2501.01507">arXiv:2501.01507</a> [<a href="https://arxiv.org/pdf/2501.01507">pdf</a>, <a href="https://arxiv.org/format/2501.01507">other</a>]</p>
<div class="tags"> <span class="tag">quant-ph (Quantum Physics)</span> <span class="tag">cs.AI (Artificial Intelligence)</span> <span class="tag">cs.LG (Machine Learning)</span> </div>
<p class="title is-5 mathjax"> Transfer Learning Analysis of Variational Quantum Circuits </p>
<p class="authors"> <strong>Authors:</strong> Huan-Hsin Tseng, Hsin-Yi Lin, Samuel Yen-Chi Chen, Shinjae Yoo </p>
<p class="abstract mathjax"> <strong>Abstract:</strong> This work analyzes transfer learning of the Variational Quantum Circuit (VQC). Our framework begins with a VQC pretrained in one domain and calculates the transition of 1-parameter unitary subgroups required for a new domain. A formalism is established to investigate the adaptability and capability of a VQC through an analysis of loss bounds. Our theory observes knowledge transfer in VQCs and provides a heuristic interpretation of the mechanism. An analytical fine-tuning method is derived to attain the optimal transition for adaptations between similar domains. </p>
<p class="is-size-7"><strong>Submitted</strong> 2 January, 2025; <strong>originally announced</strong> January 2025. </p>
<p class="comments is-size-7"> <strong>Comments:</strong> Submitted to ICASSP 2025 </p>
</li>
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.01507v1-abstract-full').style.display = 'none'; document.getElementById('2501.01507v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Submitted to ICASSP 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.00825">arXiv:2501.00825</a> <span> [<a href="https://arxiv.org/pdf/2501.00825">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1080/2331186X.2023.2245637">10.1080/2331186X.2023.2245637 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Personalized Programming Education: Using Machine Learning to Boost Learning Performance Based on Students' Personality Traits </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Tseng%2C+C">Chun-Hsiung Tseng</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+H+K">Hao-Chiang Koong Lin</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+A+C">Andrew Chih-Wei Huang</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+J">Jia-Rou Lin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.00825v1-abstract-short" style="display: inline;"> Studies have indicated that personality is related to achievement, and several personality assessment models have been developed. However, most are either questionnaires or based on marker systems, which entails limitations. We proposed a physiological signal based model, thereby ensuring the objectivity of the data and preventing unreliable responses. Thirty participants were recruited from the D… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.00825v1-abstract-full').style.display = 'inline'; document.getElementById('2501.00825v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.00825v1-abstract-full" style="display: none;"> Studies have indicated that personality is related to achievement, and several personality assessment models have been developed. However, most are either questionnaires or based on marker systems, which entails limitations. We proposed a physiological signal based model, thereby ensuring the objectivity of the data and preventing unreliable responses. Thirty participants were recruited from the Department of Electrical Engineering of Yuan Ze University in Taiwan. Wearable sensors were used to collect physiological signals as the participants watched and summarized a video. 
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.00449">arXiv:2501.00449</a> <span> [<a href="https://arxiv.org/pdf/2501.00449">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a href="https://doi.org/10.1080/2331186X.2022.2138052">10.1080/2331186X.2022.2138052</a></span> </div> </div> </div>
<p class="title is-5 mathjax"> Do Students with Different Personality Traits Demonstrate Different Physiological Signals in Video-based Learning? </p>
<p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Tseng%2C+C">Chun-Hsiung Tseng</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+H+K">Hao-Chiang Koong Lin</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yung-Hui Chen</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+J">Jia-Rou Lin</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+A+C">Andrew Chih-Wei Huang</a> </p>
<p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: Past research shows that personality traits are strong predictors of one's academic performance. Today, mature and verified marker systems for assessing personality traits already exist. However, marker-system-based assessment methods have their own limitations; for example, dishonest responses cannot be avoided. In this research, the goal is to develop a method that can overcome these limitations. The proposed method relies on physiological signals for the assessment. Thirty participants took part in this experiment. Based on the statistical results, we found correlations between students' personality traits and changes in their physiological signals when learning via videos. Specifically, we found that participants' degrees of extraversion, agreeableness, conscientiousness, and openness to experience are correlated with the variance of heart rates, the variance of GSR values, and the skewness of voice frequencies, etc. </p>
<p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li>
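<p>The reported trait-signal correlations are the sort of statistic one would compute with Pearson's r; a toy version on synthetic numbers (not the study's data):</p>
<pre><code>
# Correlating a personality-trait score with a physiological statistic.
import numpy as np
from scipy.stats import pearsonr

rng = np.random.default_rng(1)
extraversion = rng.normal(size=30)                       # questionnaire scores
hr_variance = 0.6 * extraversion + rng.normal(size=30)   # signal statistic

r, p = pearsonr(extraversion, hr_variance)
print(f"r = {r:.2f}, p = {p:.4f}")
</code></pre>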
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.00334">arXiv:2501.00334</a> <span> [<a href="https://arxiv.org/pdf/2501.00334">pdf</a>, <a href="https://arxiv.org/format/2501.00334">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div>
<p class="title is-5 mathjax"> Loss-Aware Curriculum Learning for Chinese Grammatical Error Correction </p>
<p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+D">Ding Zhang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yangning Li</a>, <a href="/search/cs?searchtype=author&query=Bai%2C+L">Lichen Bai</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+H">Hao Zhang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yinghui Li</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+H">Haiye Lin</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+H">Hai-Tao Zheng</a>, <a href="/search/cs?searchtype=author&query=Su%2C+X">Xin Su</a>, <a href="/search/cs?searchtype=author&query=Shan%2C+Z">Zifei Shan</a> </p>
<p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: Chinese grammatical error correction (CGEC) aims to detect and correct errors in input Chinese sentences. Recently, Pre-trained Language Models (PLMs) have been employed to improve performance. However, current approaches ignore that correction difficulty varies across instances and treat these samples equally, which increases the difficulty of model learning. To address this problem, we propose a multi-granularity Curriculum Learning (CL) framework. Specifically, we first calculate the correction difficulty of these samples and feed them into the model from easy to hard, batch by batch. Then Instance-Level CL is employed to help the model optimize in the appropriate direction automatically by regulating the loss function. Extensive experimental results and comprehensive analyses on various datasets prove the effectiveness of our method. </p>
<p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p>
<p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ICASSP 2025</span> </p> </li>
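<p>A rough sketch of the two granularities under my own reading of the abstract: batch-level easy-to-hard ordering plus an instance-level loss weight (the specific weighting rule below is invented for illustration, not the paper's):</p>
<pre><code>
# Multi-granularity curriculum learning, schematically: order batches by a
# precomputed difficulty score and re-weight the per-instance loss.
import numpy as np

def curriculum_batches(samples, difficulty, batch_size=4):
    order = np.argsort(difficulty)               # easy first, hard last
    for i in range(0, len(order), batch_size):
        yield [samples[j] for j in order[i:i + batch_size]]

def instance_weighted_loss(losses, difficulty):
    # down-weight unusually hard instances so they do not dominate early
    w = 1.0 / (1.0 + np.asarray(difficulty))
    return float(np.mean(w * np.asarray(losses)))

samples = ["sent_%d" % i for i in range(10)]
difficulty = np.random.default_rng(2).random(10)
for batch in curriculum_batches(samples, difficulty):
    print(batch)
</code></pre>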
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.00013">arXiv:2501.00013</a> <span> [<a href="https://arxiv.org/pdf/2501.00013">pdf</a>, <a href="https://arxiv.org/format/2501.00013">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Quantitative Methods">q-bio.QM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div>
<p class="title is-5 mathjax"> Relation-Aware Equivariant Graph Networks for Epitope-Unknown Antibody Design and Specificity Optimization </p>
<p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wu%2C+L">Lirong Wu</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+H">Haitao Lin</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+Y">Yufei Huang</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+Z">Zhangyang Gao</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+C">Cheng Tan</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yunfan Liu</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+T">Tailin Wu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+S+Z">Stan Z. Li</a> </p>
<p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: Antibodies are Y-shaped proteins that protect the host by binding to specific antigens, and their binding is mainly determined by the Complementarity-Determining Regions (CDRs) in the antibody. Despite the great progress made in CDR design, existing computational methods still encounter several challenges: 1) poor capability of modeling complex CDRs with long sequences due to insufficient contextual information; 2) conditioning on pre-given antigenic epitopes and their static interaction with the target antibody; 3) neglect of specificity during antibody optimization, which leads to non-specific antibodies. In this paper, we take into account a variety of node features, edge features, and edge relations to include more contextual and geometric information. We propose a novel Relation-Aware Antibody Design (RAAD) framework, which dynamically models antigen-antibody interactions for co-designing the sequences and structures of antigen-specific CDRs. Furthermore, we propose a new evaluation metric to better measure antibody specificity and develop a contrasting specificity-enhancing constraint to optimize the specificity of antibodies. Extensive experiments have demonstrated the superior capability of RAAD in terms of antibody modeling, generation, and optimization across different CDR types, sequence lengths, pre-training strategies, and input contexts. </p>
<p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li>
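<p>A toy version of relation-aware message passing (illustrative only; the actual RAAD layers are equivariant and far richer), showing the core idea of conditioning a message on an edge's relation type:</p>
<pre><code>
# Relation-aware message passing on a tiny graph: each relation type gets
# its own transformation, echoing the paper's use of node features, edge
# features, and edge relations. All shapes and values are made up.
import numpy as np

rng = np.random.default_rng(3)
n_nodes, dim, n_relations = 5, 8, 3
h = rng.normal(size=(n_nodes, dim))                   # node features
W = rng.normal(size=(n_relations, dim, dim)) * 0.1    # one matrix per relation
edges = [(0, 1, 0), (1, 2, 1), (2, 3, 2), (3, 4, 0)]  # (src, dst, relation)

def relation_aware_layer(h, edges, W):
    out = h.copy()
    for src, dst, rel in edges:
        out[dst] += np.tanh(W[rel] @ h[src])          # relation-specific message
    return out

h = relation_aware_layer(h, edges, W)
print(h.shape)
</code></pre>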
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.20368">arXiv:2412.20368</a> <span> [<a href="https://arxiv.org/pdf/2412.20368">pdf</a>, <a href="https://arxiv.org/format/2412.20368">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div>
<p class="title is-5 mathjax"> Subconscious Robotic Imitation Learning </p>
<p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xie%2C+J">Jun Xie</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zhicheng Wang</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+J">Jianwei Tan</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+H">Huanxu Lin</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+X">Xiaoguang Ma</a> </p>
<p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: Although robotic imitation learning (RIL) is promising for embodied intelligent robots, existing RIL approaches rely on computationally intensive multi-model trajectory predictions, resulting in slow execution and limited real-time responsiveness. In contrast, the human subconscious constantly processes and stores vast amounts of information from experience, perception, and learning, allowing people to perform complex actions such as riding a bike without consciously thinking about each step. Inspired by this phenomenon in action neurology, we introduce subconscious robotic imitation learning (SRIL), wherein cognitive offloading is combined with historical action chunking to reduce delays caused by model inference, thereby accelerating task execution. This process is further enhanced by a subconscious downsampling and pattern-augmented learning policy, wherein intent-rich information is addressed with quantized sampling techniques to improve manipulation efficiency. Experimental results demonstrate that execution speeds of SRIL were 100% to 200% faster than SOTA policies for comprehensive dual-arm tasks, with consistently higher success rates. </p>
<p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li>
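<p>The action-chunking/cognitive-offloading idea reduces, at its simplest, to caching: query the expensive model once per chunk and replay cached actions in between. A stub sketch (my own, with a fake policy; not the SRIL code):</p>
<pre><code>
# Chunked control: one model inference yields several executable actions,
# cutting inference calls per executed action.
import collections

def expensive_policy(obs, horizon=8):     # stand-in for a trajectory model
    return ["action(%s, t=%d)" % (obs, t) for t in range(horizon)]

class ChunkedController:
    def __init__(self):
        self.queue = collections.deque()
        self.calls = 0
    def act(self, obs):
        if not self.queue:                # refill only when the chunk is spent
            self.queue.extend(expensive_policy(obs))
            self.calls += 1
        return self.queue.popleft()

ctrl = ChunkedController()
for step in range(16):
    ctrl.act(obs="state")
print("model calls for 16 steps:", ctrl.calls)   # 2 instead of 16
</code></pre>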
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.17920">arXiv:2412.17920</a> <span> [<a href="https://arxiv.org/pdf/2412.17920">pdf</a>, <a href="https://arxiv.org/format/2412.17920">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div>
<p class="title is-5 mathjax"> Causal Composition Diffusion Model for Closed-loop Traffic Generation </p>
<p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lin%2C+H">Haohong Lin</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+X">Xin Huang</a>, <a href="/search/cs?searchtype=author&query=Phan-Minh%2C+T">Tung Phan-Minh</a>, <a href="/search/cs?searchtype=author&query=Hayden%2C+D+S">David S. Hayden</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+H">Huan Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+D">Ding Zhao</a>, <a href="/search/cs?searchtype=author&query=Srinivasa%2C+S">Siddhartha Srinivasa</a>, <a href="/search/cs?searchtype=author&query=Wolff%2C+E+M">Eric M. Wolff</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+H">Hongge Chen</a> </p>
<p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: Simulation is critical for safety evaluation in autonomous driving, particularly in capturing complex interactive behaviors. However, generating realistic and controllable traffic scenarios in long-tail situations remains a significant challenge. Existing generative models suffer from the conflicting objective between user-defined controllability and realism constraints, which is amplified in safety-critical contexts. In this work, we introduce the Causal Compositional Diffusion Model (CCDiff), a structure-guided diffusion framework to address these challenges. We first formulate the learning of controllable and realistic closed-loop simulation as a constrained optimization problem. Then, CCDiff maximizes controllability while adhering to realism by automatically identifying and injecting causal structures directly into the diffusion process, providing structured guidance to enhance both realism and controllability. Through rigorous evaluations on benchmark datasets and in a closed-loop simulator, CCDiff demonstrates substantial gains over state-of-the-art approaches in generating realistic and user-preferred trajectories. Our results show CCDiff's effectiveness in extracting and leveraging causal structures, showing improved closed-loop performance based on key metrics such as collision rate, off-road rate, FDE, and comfort. </p>
<p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 23 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li>
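<p>Schematically, guidance-injected sampling looks like the loop below. This is a generic classifier-guidance-style sketch, not CCDiff's causal-structure guidance; the denoiser and guidance terms are stand-in stubs, but the injection point (a correction added at every denoising step) is the structural idea.</p>
<pre><code>
# Guided diffusion sampling, schematically: add a guidance gradient at
# each denoising step. Everything here is a toy stand-in.
import numpy as np

rng = np.random.default_rng(4)

def denoise(x, t):                 # stand-in for a learned denoiser
    return x * (1.0 - 0.1 * t)

def guidance_grad(x):              # stand-in for controllability guidance
    target = np.ones_like(x)
    return target - x              # pull samples toward a user-set target

x = rng.normal(size=4)
for t in np.linspace(1.0, 0.0, 10):
    x = denoise(x, t) + 0.1 * guidance_grad(x)
print(np.round(x, 2))
</code></pre>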
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.17007">arXiv:2412.17007</a> <span> [<a href="https://arxiv.org/pdf/2412.17007">pdf</a>, <a href="https://arxiv.org/format/2412.17007">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div>
<p class="title is-5 mathjax"> Where am I? Cross-View Geo-localization with Natural Language Descriptions </p>
<p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ye%2C+J">Junyan Ye</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+H">Honglin Lin</a>, <a href="/search/cs?searchtype=author&query=Ou%2C+L">Leyan Ou</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+D">Dairong Chen</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zihao Wang</a>, <a href="/search/cs?searchtype=author&query=He%2C+C">Conghui He</a>, <a href="/search/cs?searchtype=author&query=Li%2C+W">Weijia Li</a> </p>
<p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: Cross-view geo-localization identifies the locations of street-view images by matching them with geo-tagged satellite images or OSM. However, most studies focus on image-to-image retrieval, with fewer addressing text-guided retrieval, a task vital for applications like pedestrian navigation and emergency response. In this work, we introduce a novel task for cross-view geo-localization with natural language descriptions, which aims to retrieve corresponding satellite images or OSM data based on scene text. To support this task, we construct the CVG-Text dataset by collecting cross-view data from multiple cities and employing a scene text generation approach that leverages the annotation capabilities of Large Multimodal Models to produce high-quality scene text descriptions with localization details. Additionally, we propose a novel text-based retrieval localization method, CrossText2Loc, which improves recall by 10% and demonstrates excellent long-text retrieval capabilities. In terms of explainability, it not only provides similarity scores but also offers retrieval reasons. More information can be found at https://yejy53.github.io/CVG-Text/. </p>
<p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p>
<p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">11 pages, 6 figures</span> </p> </li>
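<p>Text-based retrieval of this kind is typically scored with recall@K over embedding similarities; a self-contained mock-up with random embeddings standing in for a trained text/image encoder:</p>
<pre><code>
# Recall@K for text-to-image retrieval, on synthetic paired embeddings.
import numpy as np

rng = np.random.default_rng(5)
n, dim, K = 100, 32, 5
img = rng.normal(size=(n, dim))                 # satellite/OSM embeddings
txt = img + 0.5 * rng.normal(size=(n, dim))     # paired text embeddings

def recall_at_k(txt, img, k):
    sims = txt @ img.T                          # similarity scores
    ranks = np.argsort(-sims, axis=1)[:, :k]    # top-k candidates per query
    hits = sum(i in ranks[i] for i in range(len(txt)))
    return hits / len(txt)

print("recall@5:", recall_at_k(txt, img, K))
</code></pre>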
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.15503">arXiv:2412.15503</a> <span> [<a href="https://arxiv.org/pdf/2412.15503">pdf</a>, <a href="https://arxiv.org/format/2412.15503">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> </div> </div>
<p class="title is-5 mathjax"> Meme Trojan: Backdoor Attacks Against Hateful Meme Detection via Cross-Modal Triggers </p>
<p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+R">Ruofei Wang</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+H">Hongzhan Lin</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+Z">Ziyuan Luo</a>, <a href="/search/cs?searchtype=author&query=Cheung%2C+K+C">Ka Chun Cheung</a>, <a href="/search/cs?searchtype=author&query=See%2C+S">Simon See</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+J">Jing Ma</a>, <a href="/search/cs?searchtype=author&query=Wan%2C+R">Renjie Wan</a> </p>
<p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: Hateful meme detection aims to prevent the proliferation of hateful memes on various social media platforms. Considering its impact on social environments, this paper introduces a previously ignored but significant threat to hateful meme detection: backdoor attacks. By injecting specific triggers into meme samples, backdoor attackers can manipulate the detector to output their desired outcomes. To explore this, we propose the Meme Trojan framework to initiate backdoor attacks on hateful meme detection. Meme Trojan involves creating a novel Cross-Modal Trigger (CMT) and a learnable trigger augmentor to enhance the trigger pattern according to each input sample. Due to the cross-modal property, the proposed CMT can effectively initiate backdoor attacks on hateful meme detectors under an automatic application scenario. Additionally, the injection position and size of our triggers are adaptive to the texts contained in the meme, which ensures that the trigger is seamlessly integrated with the meme content. Our approach outperforms the state-of-the-art backdoor attack methods, showing significant improvements in effectiveness and stealthiness. We believe that this paper will draw more attention to the potential threat posed by backdoor attacks on hateful meme detection. </p>
<p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p>
<p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by AAAI25</span> </p> </li>
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by AAAI25</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.15373">arXiv:2412.15373</a> <span> [<a href="https://arxiv.org/pdf/2412.15373">pdf</a>, <a href="https://arxiv.org/format/2412.15373">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Granger Causality Detection with Kolmogorov-Arnold Networks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lin%2C+H">Hongyu Lin</a>, <a href="/search/cs?searchtype=author&query=Ren%2C+M">Mohan Ren</a>, <a href="/search/cs?searchtype=author&query=Barucca%2C+P">Paolo Barucca</a>, <a href="/search/cs?searchtype=author&query=Aste%2C+T">Tomaso Aste</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.15373v1-abstract-short" style="display: inline;"> Discovering causal relationships in time series data is central in many scientific areas, ranging from economics to climate science. Granger causality is a powerful tool for causality detection. However, its original formulation is limited by its linear form and only recently nonlinear machine-learning generalizations have been introduced. This study contributes to the definition of neural Granger… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.15373v1-abstract-full').style.display = 'inline'; document.getElementById('2412.15373v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.15373v1-abstract-full" style="display: none;"> Discovering causal relationships in time series data is central in many scientific areas, ranging from economics to climate science. Granger causality is a powerful tool for causality detection. However, its original formulation is limited by its linear form and only recently nonlinear machine-learning generalizations have been introduced. This study contributes to the definition of neural Granger causality models by investigating the application of Kolmogorov-Arnold networks (KANs) in Granger causality detection and comparing their capabilities against multilayer perceptrons (MLP). In this work, we develop a framework called Granger Causality KAN (GC-KAN) along with a tailored training approach designed specifically for Granger causality detection. We test this framework on both Vector Autoregressive (VAR) models and chaotic Lorenz-96 systems, analysing the ability of KANs to sparsify input features by identifying Granger causal relationships, providing a concise yet accurate model for Granger causality detection. 
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.15115">arXiv:2412.15115</a> <span> [<a href="https://arxiv.org/pdf/2412.15115">pdf</a>, <a href="https://arxiv.org/format/2412.15115">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div>
<p class="title is-5 mathjax"> Qwen2.5 Technical Report </p>
<p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Qwen"> Qwen</a>, <a href="/search/cs?searchtype=author&query=%3A"> :</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+A">An Yang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+B">Baosong Yang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+B">Beichen Zhang</a>, <a href="/search/cs?searchtype=author&query=Hui%2C+B">Binyuan Hui</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+B">Bo Zheng</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+B">Bowen Yu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+C">Chengyuan Li</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+D">Dayiheng Liu</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+F">Fei Huang</a>, <a href="/search/cs?searchtype=author&query=Wei%2C+H">Haoran Wei</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+H">Huan Lin</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+J">Jian Yang</a>, <a href="/search/cs?searchtype=author&query=Tu%2C+J">Jianhong Tu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jianwei Zhang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+J">Jianxin Yang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+J">Jiaxi Yang</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+J">Jingren Zhou</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+J">Junyang Lin</a>, <a href="/search/cs?searchtype=author&query=Dang%2C+K">Kai Dang</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+K">Keming Lu</a>, <a href="/search/cs?searchtype=author&query=Bao%2C+K">Keqin Bao</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+K">Kexin Yang</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+L">Le Yu</a> , et al. (19 additional authors not shown) </p>
<p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: In this report, we introduce Qwen2.5, a comprehensive series of large language models (LLMs) designed to meet diverse needs. Compared to previous iterations, Qwen2.5 has been significantly improved during both the pre-training and post-training stages. In terms of pre-training, we have scaled the high-quality pre-training datasets from the previous 7 trillion tokens to 18 trillion tokens. This provides a strong foundation for common sense, expert knowledge, and reasoning capabilities. In terms of post-training, we implement intricate supervised finetuning with over 1 million samples, as well as multistage reinforcement learning. Post-training techniques enhance alignment with human preference, and notably improve long text generation, structural data analysis, and instruction following. To handle diverse and varied use cases effectively, we present the Qwen2.5 LLM series in rich sizes. Open-weight offerings include base and instruction-tuned models, with quantized versions available. In addition, for hosted solutions, the proprietary models currently include two mixture-of-experts (MoE) variants: Qwen2.5-Turbo and Qwen2.5-Plus, both available from Alibaba Cloud Model Studio. Qwen2.5 has demonstrated top-tier performance on a wide range of benchmarks evaluating language understanding, reasoning, mathematics, coding, human preference alignment, etc. Specifically, the open-weight flagship Qwen2.5-72B-Instruct outperforms a number of open and proprietary models and demonstrates competitive performance to the state-of-the-art open-weight model, Llama-3-405B-Instruct, which is around 5 times larger. Qwen2.5-Turbo and Qwen2.5-Plus offer superior cost-effectiveness while performing competitively against GPT-4o-mini and GPT-4o respectively. Additionally, as the foundation, Qwen2.5 models have been instrumental in training specialized models such as Qwen2.5-Math, Qwen2.5-Coder, QwQ, and multimodal models. </p>
<p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 19 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li>
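<p>The open-weight checkpoints load with the standard Hugging Face transformers API. A usage sketch (model id as released on the Hub; assumes torch and enough memory for a 7B model):</p>
<pre><code>
# Loading an open-weight Qwen2.5 instruction-tuned checkpoint.
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "Qwen/Qwen2.5-7B-Instruct"
tok = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto",
                                             device_map="auto")

messages = [{"role": "user", "content": "One-line summary of Qwen2.5?"}]
inputs = tok.apply_chat_template(messages, add_generation_prompt=True,
                                 return_tensors="pt").to(model.device)
out = model.generate(inputs, max_new_tokens=64)
print(tok.decode(out[0][inputs.shape[-1]:], skip_special_tokens=True))
</code></pre>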
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.14015">arXiv:2412.14015</a> <span> [<a href="https://arxiv.org/pdf/2412.14015">pdf</a>, <a href="https://arxiv.org/format/2412.14015">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div>
<p class="title is-5 mathjax"> Prompting Depth Anything for 4K Resolution Accurate Metric Depth Estimation </p>
<p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lin%2C+H">Haotong Lin</a>, <a href="/search/cs?searchtype=author&query=Peng%2C+S">Sida Peng</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+J">Jingxiao Chen</a>, <a href="/search/cs?searchtype=author&query=Peng%2C+S">Songyou Peng</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+J">Jiaming Sun</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+M">Minghuan Liu</a>, <a href="/search/cs?searchtype=author&query=Bao%2C+H">Hujun Bao</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+J">Jiashi Feng</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+X">Xiaowei Zhou</a>, <a href="/search/cs?searchtype=author&query=Kang%2C+B">Bingyi Kang</a> </p>
<p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: Prompts play a critical role in unleashing the power of language and vision foundation models for specific tasks. For the first time, we introduce prompting into depth foundation models, creating a new paradigm for metric depth estimation termed Prompt Depth Anything. Specifically, we use a low-cost LiDAR as the prompt to guide the Depth Anything model for accurate metric depth output, achieving up to 4K resolution. Our approach centers on a concise prompt fusion design that integrates the LiDAR at multiple scales within the depth decoder. To address training challenges posed by limited datasets containing both LiDAR depth and precise GT depth, we propose a scalable data pipeline that includes synthetic-data LiDAR simulation and real-data pseudo-GT depth generation. Our approach sets a new state of the art on the ARKitScenes and ScanNet++ datasets and benefits downstream applications, including 3D reconstruction and generalized robotic grasping. </p>
<p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p>
<p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project page: https://PromptDA.github.io/</span> </p> </li>
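<p>A schematic guess at the shape of multi-scale prompt fusion (my own sketch under assumed decoder scales, not the released implementation): resize the low-resolution LiDAR depth to each decoder scale and inject it as conditioning.</p>
<pre><code>
# Multi-scale prompt fusion, schematically: a low-res metric depth prompt
# is resized to each decoder scale and added to the features there.
import numpy as np

def resize_nearest(d, size):
    ys = np.arange(size) * d.shape[0] // size
    xs = np.arange(size) * d.shape[1] // size
    return d[np.ix_(ys, xs)]

lidar = np.random.default_rng(8).random((24, 24))        # low-res prompt
features = {s: np.zeros((s, s)) for s in (48, 96, 192)}  # assumed scales

fused = {s: f + resize_nearest(lidar, s) for s, f in features.items()}
print({s: v.shape for s, v in fused.items()})
</code></pre>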
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.14015v1-abstract-full').style.display = 'none'; document.getElementById('2412.14015v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project page: https://PromptDA.github.io/</span> </p> </li> </ol> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Lin%2C+H&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Lin%2C+H&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Lin%2C+H&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Lin%2C+H&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Lin%2C+H&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Lin%2C+H&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">…</span></li> </ul> </nav> <div class="is-hidden-tablet"> <!-- feedback for mobile only --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a> </span> </div> </div> </main> <footer> <div class="columns is-desktop" role="navigation" aria-label="Secondary"> <!-- MetaColumn 1 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a 
href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div> <!-- end MetaColumn 1 --> <!-- MetaColumn 2 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div> <!-- end MetaColumn 2 --> </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>