Search | arXiv e-print repository

Showing 1-50 of 1,155 results for author: Lin, C

Searching in archive cs. Sorted by announcement date (newest first); 50 results per page.

</div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&amp;query=Lin%2C+C&amp;start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&amp;query=Lin%2C+C&amp;start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Lin%2C+C&amp;start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Lin%2C+C&amp;start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Lin%2C+C&amp;start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Lin%2C+C&amp;start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">&hellip;</span></li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.18462">arXiv:2502.18462</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.18462">pdf</a>, <a href="https://arxiv.org/format/2502.18462">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Scalable Equilibrium Sampling with Sequential Boltzmann Generators </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Tan%2C+C+B">Charlie B. Tan</a>, <a href="/search/cs?searchtype=author&amp;query=Bose%2C+A+J">Avishek Joey Bose</a>, <a href="/search/cs?searchtype=author&amp;query=Lin%2C+C">Chen Lin</a>, <a href="/search/cs?searchtype=author&amp;query=Klein%2C+L">Leon Klein</a>, <a href="/search/cs?searchtype=author&amp;query=Bronstein%2C+M+M">Michael M. Bronstein</a>, <a href="/search/cs?searchtype=author&amp;query=Tong%2C+A">Alexander Tong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.18462v1-abstract-short" style="display: inline;"> Scalable sampling of molecular states in thermodynamic equilibrium is a long-standing challenge in statistical physics. Boltzmann generators tackle this problem by pairing powerful normalizing flows with importance sampling to obtain statistically independent samples under the target distribution. 
In this paper, we extend the Boltzmann generator framework and introduce Sequential Boltzmann generat&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.18462v1-abstract-full').style.display = 'inline'; document.getElementById('2502.18462v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.18462v1-abstract-full" style="display: none;"> Scalable sampling of molecular states in thermodynamic equilibrium is a long-standing challenge in statistical physics. Boltzmann generators tackle this problem by pairing powerful normalizing flows with importance sampling to obtain statistically independent samples under the target distribution. In this paper, we extend the Boltzmann generator framework and introduce Sequential Boltzmann generators (SBG) with two key improvements. The first is a highly efficient non-equivariant Transformer-based normalizing flow operating directly on all-atom Cartesian coordinates. In contrast to equivariant continuous flows of prior methods, we leverage exactly invertible non-equivariant architectures which are highly efficient both during sample generation and likelihood computation. As a result, this unlocks more sophisticated inference strategies beyond standard importance sampling. More precisely, as a second key improvement we perform inference-time scaling of flow samples using annealed Langevin dynamics which transports samples toward the target distribution leading to lower variance (annealed) importance weights which enable higher fidelity resampling with sequential Monte Carlo. SBG achieves state-of-the-art performance w.r.t. all metrics on molecular systems, demonstrating the first equilibrium sampling in Cartesian coordinates of tri, tetra, and hexapeptides that were so far intractable for prior Boltzmann generators. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.18462v1-abstract-full').style.display = 'none'; document.getElementById('2502.18462v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. 
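A rough sketch of the importance-sampling-plus-resampling loop this abstract builds on, using a 1-D toy double-well target; the Gaussian proposal here stands in for the paper's Transformer normalizing flow, the annealed Langevin transport is omitted, and all constants are illustrative assumptions:

```python
# Toy importance sampling + resampling on a 1-D double-well target.
# The Gaussian proposal stands in for a trained normalizing flow; the
# annealed Langevin transport used by SBG is omitted.
import numpy as np

rng = np.random.default_rng(0)

def target_log_prob(x):
    """Unnormalized log-density of a double-well Boltzmann distribution."""
    return -2.0 * (x**2 - 1.0)**2

def proposal_sample(n, sigma=2.0):
    return rng.normal(0.0, sigma, size=n)

def proposal_log_prob(x, sigma=2.0):
    return -0.5 * (x / sigma)**2 - np.log(sigma * np.sqrt(2.0 * np.pi))

x = proposal_sample(10_000)
log_w = target_log_prob(x) - proposal_log_prob(x)  # log importance weights
w = np.exp(log_w - log_w.max())
w /= w.sum()

ess = 1.0 / np.sum(w**2)  # effective sample size: diagnoses weight collapse
resampled = x[rng.choice(len(x), size=len(x), p=w)]  # multinomial resampling
print(f"ESS = {ess:.0f} / {len(x)}")
```

Lower-variance weights (as SBG obtains via annealing) raise the effective sample size and make this resampling step less wasteful.
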
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Preprint</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.17445">arXiv:2502.17445</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.17445">pdf</a>, <a href="https://arxiv.org/format/2502.17445">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Neurons and Cognition">q-bio.NC</span> </div> </div> <p class="title is-5 mathjax"> Interpretable Dual-Filter Fuzzy Neural Networks for Affective Brain-Computer Interfaces </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Jiang%2C+X">Xiaowei Jiang</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+Y">Yanan Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Pal%2C+N+R">Nikhil Ranjan Pal</a>, <a href="/search/cs?searchtype=author&amp;query=Chang%2C+Y">Yu-Cheng Chang</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+Y">Yunkai Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Do%2C+T">Thomas Do</a>, <a href="/search/cs?searchtype=author&amp;query=Lin%2C+C">Chin-Teng Lin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.17445v1-abstract-short" style="display: inline;"> Fuzzy logic provides a robust framework for enhancing explainability, particularly in domains requiring the interpretation of complex and ambiguous signals, such as brain-computer interface (BCI) systems. Despite significant advances in deep learning, interpreting human emotions remains a formidable challenge. In this work, we present iFuzzyAffectDuo, a novel computational model that integrates a&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.17445v1-abstract-full').style.display = 'inline'; document.getElementById('2502.17445v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.17445v1-abstract-full" style="display: none;"> Fuzzy logic provides a robust framework for enhancing explainability, particularly in domains requiring the interpretation of complex and ambiguous signals, such as brain-computer interface (BCI) systems. Despite significant advances in deep learning, interpreting human emotions remains a formidable challenge. In this work, we present iFuzzyAffectDuo, a novel computational model that integrates a dual-filter fuzzy neural network architecture for improved detection and interpretation of emotional states from neuroimaging data. The model introduces a new membership function (MF) based on the Laplace distribution, achieving superior accuracy and interpretability compared to traditional approaches. 
By refining the extraction of neural signals associated with specific emotions, iFuzzyAffectDuo offers a human-understandable framework that unravels the underlying decision-making processes. We validate our approach across three neuroimaging datasets using functional Near-Infrared Spectroscopy (fNIRS) and Electroencephalography (EEG), demonstrating its potential to advance affective computing. These findings open new pathways for understanding the neural basis of emotions and their application in enhancing human-computer interaction. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.17445v1-abstract-full').style.display = 'none'; document.getElementById('2502.17445v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.15843">arXiv:2502.15843</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.15843">pdf</a>, <a href="https://arxiv.org/format/2502.15843">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Chemical Physics">physics.chem-ph</span> </div> </div> <p class="title is-5 mathjax"> Implicit Neural Representations for Chemical Reaction Paths </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Ramakrishnan%2C+K">Kalyan Ramakrishnan</a>, <a href="/search/cs?searchtype=author&amp;query=Schaaf%2C+L+L">Lars L. Schaaf</a>, <a href="/search/cs?searchtype=author&amp;query=Lin%2C+C">Chen Lin</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+G">Guangrun Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Torr%2C+P">Philip Torr</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.15843v1-abstract-short" style="display: inline;"> We show that neural networks can be optimized to represent minimum energy paths as continuous functions, offering a flexible alternative to discrete path-search methods like Nudged Elastic Band (NEB). Our approach parameterizes reaction paths with a network trained on a loss function that discards tangential energy gradients and enables instant estimation of the transition state. We first validate&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.15843v1-abstract-full').style.display = 'inline'; document.getElementById('2502.15843v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.15843v1-abstract-full" style="display: none;"> We show that neural networks can be optimized to represent minimum energy paths as continuous functions, offering a flexible alternative to discrete path-search methods like Nudged Elastic Band (NEB). Our approach parameterizes reaction paths with a network trained on a loss function that discards tangential energy gradients and enables instant estimation of the transition state. 
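The abstract does not spell out the Laplace-based membership function, so the following is one natural reading (an assumption, not the paper's parameterization): membership peaks at a center m and decays exponentially with |x - m|, giving heavier tails than the conventional Gaussian MF.

```python
# Hedged sketch of a Laplace-shaped fuzzy membership function, contrasted
# with a Gaussian MF. Parameter names m (center) and b (scale) are
# illustrative assumptions.
import numpy as np

def laplace_mf(x, m, b):
    """Membership in [0, 1]: 1 at the center m, exp(-|x - m| / b) elsewhere."""
    return np.exp(-np.abs(x - m) / b)

def gaussian_mf(x, m, sigma):
    """Conventional Gaussian MF, for comparison."""
    return np.exp(-0.5 * ((x - m) / sigma)**2)

x = np.linspace(-4, 4, 9)
print(laplace_mf(x, m=0.0, b=1.0))    # heavier tails: slower decay from center
print(gaussian_mf(x, m=0.0, sigma=1.0))
```
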
3. arXiv:2502.15843 [pdf, other] - cs.LG, physics.chem-ph
   Implicit Neural Representations for Chemical Reaction Paths
   Authors: Kalyan Ramakrishnan, Lars L. Schaaf, Chen Lin, Guangrun Wang, Philip Torr
   Abstract: We show that neural networks can be optimized to represent minimum energy paths as continuous functions, offering a flexible alternative to discrete path-search methods like Nudged Elastic Band (NEB). Our approach parameterizes reaction paths with a network trained on a loss function that discards tangential energy gradients and enables instant estimation of the transition state. We first validate the method on two-dimensional potentials and then demonstrate its advantages over NEB on challenging atomistic systems where (i) poor initial guesses yield unphysical paths, (ii) multiple competing paths exist, or (iii) the reaction follows a complex multi-step mechanism. Results highlight the versatility of the method -- for instance, a simple adjustment to the sampling strategy during optimization can help escape local-minimum solutions. Finally, in a low-dimensional setting, we demonstrate that a single neural network can learn from existing paths and generalize to unseen systems, showing promise for a universal reaction path representation.
   Submitted 20 February, 2025; originally announced February 2025.
   Comments: Intended for submission to the Journal of Chemical Physics. Once published, it will be available at [DOI/URL]

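A hedged sketch of the core idea as stated in the abstract: a network parameterizes the path x(t), and training penalizes only the component of the energy gradient perpendicular to the path tangent. The 2-D potential, endpoint pinning, and finite-difference tangent below are illustrative choices, not the paper's actual formulation.

```python
# Minimal sketch: train a network to represent a minimum energy path by
# minimizing the energy gradient perpendicular to the path tangent
# (tangential gradients are discarded, as the abstract describes).
import torch

def energy(x):  # toy 2-D double-well with minima at (-1, 0) and (1, 0)
    return (x[..., 0]**2 - 1.0)**2 + 2.0 * x[..., 1]**2

net = torch.nn.Sequential(torch.nn.Linear(1, 64), torch.nn.Tanh(),
                          torch.nn.Linear(64, 2))
a = torch.tensor([-1.0, 0.0])
b = torch.tensor([1.0, 0.0])

def path(t):
    # Endpoints pinned so x(0) = a and x(1) = b for any network weights.
    return (1 - t) * a + t * b + t * (1 - t) * net(t)

opt = torch.optim.Adam(net.parameters(), lr=1e-3)
for step in range(2000):
    t = torch.rand(128, 1)
    x = path(t)
    tan = (path(t + 1e-3) - path(t - 1e-3)) / 2e-3      # dx/dt, finite differences
    tan = tan / tan.norm(dim=-1, keepdim=True)
    g = torch.autograd.grad(energy(x).sum(), x, create_graph=True)[0]
    g_perp = g - (g * tan).sum(-1, keepdim=True) * tan  # drop tangential part
    loss = g_perp.pow(2).sum(-1).mean()
    opt.zero_grad(); loss.backward(); opt.step()

with torch.no_grad():                                    # "instant" TS estimate:
    pts = path(torch.linspace(0, 1, 101).unsqueeze(1))   # energy max along x(t)
    transition_state = pts[energy(pts).argmax()]
```
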
4. arXiv:2502.14743 [pdf, ps, other] - cs.MA, cs.AI
   Multi-Agent Coordination across Diverse Applications: A Survey
   Authors: Lijun Sun, Yijun Yang, Qiqi Duan, Yuhui Shi, Chao Lyu, Yu-Cheng Chang, Chin-Teng Lin, Yang Shen
   Abstract: Multi-agent coordination studies the underlying mechanism enabling the trending spread of diverse multi-agent systems (MAS) and has received increasing attention, driven by the expansion of emerging applications and rapid AI advances. This survey outlines the current state of coordination research across applications through a unified understanding that answers four fundamental coordination questions: (1) what is coordination; (2) why coordination; (3) who to coordinate with; and (4) how to coordinate. Our purpose is to explore existing ideas and expertise in coordination and their connections across diverse applications, while identifying and highlighting emerging and promising research directions. First, general coordination problems that are essential to varied applications are identified and analyzed. Second, a number of MAS applications are surveyed, ranging from widely studied domains, e.g., search and rescue, warehouse automation and logistics, and transportation systems, to emerging fields including humanoid and anthropomorphic robots, satellite systems, and large language models (LLMs). Finally, open challenges about the scalability, heterogeneity, and learning mechanisms of MAS are analyzed and discussed. In particular, we identify the hybridization of hierarchical and decentralized coordination, human-MAS coordination, and LLM-based MAS as promising future directions.
   Submitted 20 February, 2025; v1 submitted 20 February, 2025; originally announced February 2025.
   Comments: 23 pages, 4 figures, 2 tables

5. arXiv:2502.14739 [pdf, other] - cs.CL
   SuperGPQA: Scaling LLM Evaluation across 285 Graduate Disciplines
   Authors: M-A-P Team, Xinrun Du, Yifan Yao, Kaijing Ma, Bingli Wang, Tianyu Zheng, Kang Zhu, Minghao Liu, Yiming Liang, Xiaolong Jin, Zhenlin Wei, Chujie Zheng, Kaixing Deng, Shuyue Guo, Shian Jia, Sichao Jiang, Yiyan Liao, Rui Li, Qinrui Li, Sirun Li, Yizhi Li, Yunwen Li, Dehua Ma, Yuansheng Ni, Haoran Que, et al. (70 additional authors not shown)
   Abstract: Large language models (LLMs) have demonstrated remarkable proficiency in mainstream academic disciplines such as mathematics, physics, and computer science. However, human knowledge encompasses over 200 specialized disciplines, far exceeding the scope of existing benchmarks. The capabilities of LLMs in many of these specialized fields - particularly in light industry, agriculture, and service-oriented disciplines - remain inadequately evaluated. To address this gap, we present SuperGPQA, a comprehensive benchmark that evaluates graduate-level knowledge and reasoning capabilities across 285 disciplines. Our benchmark employs a novel Human-LLM collaborative filtering mechanism to eliminate trivial or ambiguous questions through iterative refinement based on both LLM responses and expert feedback. Our experimental results reveal significant room for improvement in the performance of current state-of-the-art LLMs across diverse knowledge domains (e.g., the reasoning-focused model DeepSeek-R1 achieved the highest accuracy of 61.82% on SuperGPQA), highlighting the considerable gap between current model capabilities and artificial general intelligence. Additionally, we present comprehensive insights from our management of a large-scale annotation process, involving over 80 expert annotators and an interactive Human-LLM collaborative system, offering valuable methodological guidance for future research initiatives of comparable scope.
   Submitted 20 February, 2025; originally announced February 2025.

</p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Nguyen%2C+Q">Quoc-Toan Nguyen</a>, <a href="/search/cs?searchtype=author&amp;query=Le%2C+L">Linh Le</a>, <a href="/search/cs?searchtype=author&amp;query=Tran%2C+X">Xuan-The Tran</a>, <a href="/search/cs?searchtype=author&amp;query=Do%2C+T">Thomas Do</a>, <a href="/search/cs?searchtype=author&amp;query=Lin%2C+C">Chin-Teng Lin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.06907v1-abstract-short" style="display: inline;"> Can ChatGPT diagnose Alzheimer&#39;s Disease (AD)? AD is a devastating neurodegenerative condition that affects approximately 1 in 9 individuals aged 65 and older, profoundly impairing memory and cognitive function. This paper utilises 9300 electronic health records (EHRs) with data from Magnetic Resonance Imaging (MRI) and cognitive tests to address an intriguing question: As a general-purpose task s&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.06907v1-abstract-full').style.display = 'inline'; document.getElementById('2502.06907v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.06907v1-abstract-full" style="display: none;"> Can ChatGPT diagnose Alzheimer&#39;s Disease (AD)? AD is a devastating neurodegenerative condition that affects approximately 1 in 9 individuals aged 65 and older, profoundly impairing memory and cognitive function. This paper utilises 9300 electronic health records (EHRs) with data from Magnetic Resonance Imaging (MRI) and cognitive tests to address an intriguing question: As a general-purpose task solver, can ChatGPT accurately detect AD using EHRs? We present an in-depth evaluation of ChatGPT using a black-box approach with zero-shot and multi-shot methods. This study unlocks ChatGPT&#39;s capability to analyse MRI and cognitive test results, as well as its potential as a diagnostic tool for AD. By automating aspects of the diagnostic process, this research opens a transformative approach for the healthcare system, particularly in addressing disparities in resource-limited regions where AD specialists are scarce. Hence, it offers a foundation for a promising method for early detection, supporting individuals with timely interventions, which is paramount for Quality of Life (QoL). <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.06907v1-abstract-full').style.display = 'none'; document.getElementById('2502.06907v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">14 pages, 5 figures, 5 tables</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.06116">arXiv:2502.06116</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.06116">pdf</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Instrumentation and Detectors">physics.ins-det</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Event Vision Sensor: A Review </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Qin%2C+X">Xinyue Qin</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+J">Junlin Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Bao%2C+W">Wenzhong Bao</a>, <a href="/search/cs?searchtype=author&amp;query=Lin%2C+C">Chun Lin</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+H">Honglei Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.06116v1-abstract-short" style="display: inline;"> By monitoring temporal contrast, event-based vision sensors can provide high temporal resolution and low latency while maintaining low power consumption and simplicity in circuit structure. These characteristics have garnered significant attention in both academia and industry. In recent years, the application of back-illuminated (BSI) technology, wafer stacking techniques, and industrial interfac&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.06116v1-abstract-full').style.display = 'inline'; document.getElementById('2502.06116v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.06116v1-abstract-full" style="display: none;"> By monitoring temporal contrast, event-based vision sensors can provide high temporal resolution and low latency while maintaining low power consumption and simplicity in circuit structure. These characteristics have garnered significant attention in both academia and industry. In recent years, the application of back-illuminated (BSI) technology, wafer stacking techniques, and industrial interfaces has brought new opportunities for enhancing the performance of event-based vision sensors. This is evident in the substantial advancements made in reducing noise, improving resolution, and increasing readout rates. Additionally, the integration of these technologies has enhanced the compatibility of event-based vision sensors with current and edge vision systems, providing greater possibilities for their practical applications. This paper will review the progression from neuromorphic engineering to state-of-the-art event-based vision sensor technologies, including their development trends, operating principles, and key features. Moreover, we will delve into the sensitivity of event-based vision sensors and the opportunities and challenges they face in the realm of infrared imaging, providing references for future research and applications. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.06116v1-abstract-full').style.display = 'none'; document.getElementById('2502.06116v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.05589">arXiv:2502.05589</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.05589">pdf</a>, <a href="https://arxiv.org/format/2502.05589">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> On Memory Construction and Retrieval for Personalized Conversational Agents </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Pan%2C+Z">Zhuoshi Pan</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+Q">Qianhui Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Jiang%2C+H">Huiqiang Jiang</a>, <a href="/search/cs?searchtype=author&amp;query=Luo%2C+X">Xufang Luo</a>, <a href="/search/cs?searchtype=author&amp;query=Cheng%2C+H">Hao Cheng</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+D">Dongsheng Li</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+Y">Yuqing Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Lin%2C+C">Chin-Yew Lin</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+H+V">H. Vicky Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Qiu%2C+L">Lili Qiu</a>, <a href="/search/cs?searchtype=author&amp;query=Gao%2C+J">Jianfeng Gao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.05589v2-abstract-short" style="display: inline;"> To deliver coherent and personalized experiences in long-term conversations, existing approaches typically perform retrieval augmented response generation by constructing memory banks from conversation history at either the turn-level, session-level, or through summarization techniques. In this paper, we present two key findings: (1) The granularity of memory unit matters: Turn-level, session-leve&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.05589v2-abstract-full').style.display = 'inline'; document.getElementById('2502.05589v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.05589v2-abstract-full" style="display: none;"> To deliver coherent and personalized experiences in long-term conversations, existing approaches typically perform retrieval augmented response generation by constructing memory banks from conversation history at either the turn-level, session-level, or through summarization techniques. 
8. arXiv:2502.05589 [pdf, other] - cs.CL, cs.AI
   On Memory Construction and Retrieval for Personalized Conversational Agents
   Authors: Zhuoshi Pan, Qianhui Wu, Huiqiang Jiang, Xufang Luo, Hao Cheng, Dongsheng Li, Yuqing Yang, Chin-Yew Lin, H. Vicky Zhao, Lili Qiu, Jianfeng Gao
   Abstract: To deliver coherent and personalized experiences in long-term conversations, existing approaches typically perform retrieval augmented response generation by constructing memory banks from conversation history at either the turn-level, session-level, or through summarization techniques. In this paper, we present two key findings: (1) The granularity of memory unit matters: Turn-level, session-level, and summarization-based methods each exhibit limitations in both memory retrieval accuracy and the semantic quality of the retrieved content. (2) Prompt compression methods, such as LLMLingua-2, can effectively serve as a denoising mechanism, enhancing memory retrieval accuracy across different granularities. Building on these insights, we propose SeCom, a method that constructs a memory bank with topical segments by introducing a conversation Segmentation model, while performing memory retrieval based on Compressed memory units. Experimental results show that SeCom outperforms turn-level, session-level, and several summarization-based methods on long-term conversation benchmarks such as LOCOMO and Long-MT-Bench+. Additionally, the proposed conversation segmentation method demonstrates superior performance on dialogue segmentation datasets such as DialSeg711, TIAGE, and SuperDialSeg.
   Submitted 11 February, 2025; v1 submitted 8 February, 2025; originally announced February 2025.
   Comments: 10 pages, 5 figures, conference

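A schematic of the retrieval pattern the abstract describes: build the memory bank from conversation segments rather than raw turns, then retrieve the segments closest to the query. The fixed-size windows and bag-of-words similarity below are toy stand-ins for SeCom's learned segmentation model and LLMLingua-2 compression, which are not reproduced here.

```python
# Toy segment-level memory bank with bag-of-words retrieval.
from collections import Counter
import math

def embed(text):                        # bag-of-words stand-in embedding
    return Counter(text.lower().split())

def cosine(a, b):
    dot = sum(a[w] * b[w] for w in a)
    na = math.sqrt(sum(v * v for v in a.values()))
    nb = math.sqrt(sum(v * v for v in b.values()))
    return dot / (na * nb) if na and nb else 0.0

def build_memory(turns, seg_len=4):     # fixed windows stand in for topical segments
    return [" ".join(turns[i:i + seg_len]) for i in range(0, len(turns), seg_len)]

def retrieve(memory, query, k=2):
    q = embed(query)
    return sorted(memory, key=lambda seg: cosine(embed(seg), q), reverse=True)[:k]

turns = ["user: I adopted a beagle named Milo",
         "assistant: Congrats! How old is Milo?",
         "user: He is two, very energetic",
         "assistant: Beagles need lots of walks",
         "user: Separately, I am planning a trip to Kyoto",
         "assistant: Autumn is a great season there"]
print(retrieve(build_memory(turns), "what breed is my dog?"))
```
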
9. arXiv:2502.05497 [pdf, other] - cs.CL
   DeepThink: Aligning Language Models with Domain-Specific User Intents
   Authors: Yang Li, Mingxuan Luo, Yeyun Gong, Chen Lin, Jian Jiao, Yi Liu, Kaili Huang
   Abstract: Supervised fine-tuning with synthesized instructions has been a common practice for adapting LLMs to domain-specific QA tasks. However, the synthesized instructions deviate from real user questions and expected answers. This study proposes a novel framework called DeepThink to generate high-quality instructions. DeepThink first generates a few seed questions to mimic actual user questions, simulates conversations to uncover the hidden user needs, and refines the answer by conversational contexts and the retrieved documents for more comprehensive answers. Experiments demonstrate that DeepThink achieves an average performance improvement of 7.92% compared to a GPT-4-turbo+RAG-based assistant on the real user test set in the advertising domain across dimensions such as relevance, completeness, clarity, accuracy, and actionability.
   Submitted 13 February, 2025; v1 submitted 8 February, 2025; originally announced February 2025.

</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.05176">arXiv:2502.05176</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.05176">pdf</a>, <a href="https://arxiv.org/format/2502.05176">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> AuraFusion360: Augmented Unseen Region Alignment for Reference-based 360掳 Unbounded Scene Inpainting </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wu%2C+C">Chung-Ho Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+Y">Yang-Jung Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+Y">Ying-Huan Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Lee%2C+J">Jie-Ying Lee</a>, <a href="/search/cs?searchtype=author&amp;query=Ke%2C+B">Bo-Hsu Ke</a>, <a href="/search/cs?searchtype=author&amp;query=Mu%2C+C+T">Chun-Wei Tuan Mu</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+Y">Yi-Chuan Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Lin%2C+C">Chin-Yang Lin</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+M">Min-Hung Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Lin%2C+Y">Yen-Yu Lin</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Y">Yu-Lun Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.05176v1-abstract-short" style="display: inline;"> Three-dimensional scene inpainting is crucial for applications from virtual reality to architectural visualization, yet existing methods struggle with view consistency and geometric accuracy in 360掳 unbounded scenes. We present AuraFusion360, a novel reference-based method that enables high-quality object removal and hole filling in 3D scenes represented by Gaussian Splatting. Our approach introdu&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.05176v1-abstract-full').style.display = 'inline'; document.getElementById('2502.05176v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.05176v1-abstract-full" style="display: none;"> Three-dimensional scene inpainting is crucial for applications from virtual reality to architectural visualization, yet existing methods struggle with view consistency and geometric accuracy in 360掳 unbounded scenes. We present AuraFusion360, a novel reference-based method that enables high-quality object removal and hole filling in 3D scenes represented by Gaussian Splatting. Our approach introduces (1) depth-aware unseen mask generation for accurate occlusion identification, (2) Adaptive Guided Depth Diffusion, a zero-shot method for accurate initial point placement without requiring additional training, and (3) SDEdit-based detail enhancement for multi-view coherence. We also introduce 360-USID, the first comprehensive dataset for 360掳 unbounded scene inpainting with ground truth. Extensive experiments demonstrate that AuraFusion360 significantly outperforms existing methods, achieving superior perceptual quality while maintaining geometric accuracy across dramatic viewpoint changes. 
See our project page for video results and the dataset at https://kkennethwu.github.io/aurafusion360/. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.05176v1-abstract-full').style.display = 'none'; document.getElementById('2502.05176v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project page: https://kkennethwu.github.io/aurafusion360/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.05151">arXiv:2502.05151</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.05151">pdf</a>, <a href="https://arxiv.org/format/2502.05151">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Transforming Science with Large Language Models: A Survey on AI-assisted Scientific Discovery, Experimentation, Content Generation, and Evaluation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Eger%2C+S">Steffen Eger</a>, <a href="/search/cs?searchtype=author&amp;query=Cao%2C+Y">Yong Cao</a>, <a href="/search/cs?searchtype=author&amp;query=D%27Souza%2C+J">Jennifer D&#39;Souza</a>, <a href="/search/cs?searchtype=author&amp;query=Geiger%2C+A">Andreas Geiger</a>, <a href="/search/cs?searchtype=author&amp;query=Greisinger%2C+C">Christian Greisinger</a>, <a href="/search/cs?searchtype=author&amp;query=Gross%2C+S">Stephanie Gross</a>, <a href="/search/cs?searchtype=author&amp;query=Hou%2C+Y">Yufang Hou</a>, <a href="/search/cs?searchtype=author&amp;query=Krenn%2C+B">Brigitte Krenn</a>, <a href="/search/cs?searchtype=author&amp;query=Lauscher%2C+A">Anne Lauscher</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Y">Yizhi Li</a>, <a href="/search/cs?searchtype=author&amp;query=Lin%2C+C">Chenghua Lin</a>, <a href="/search/cs?searchtype=author&amp;query=Moosavi%2C+N+S">Nafise Sadat Moosavi</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+W">Wei Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Miller%2C+T">Tristan Miller</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.05151v1-abstract-short" style="display: inline;"> With the advent of large multimodal language models, science is now at a threshold of an AI-based technological transformation. Recently, a plethora of new AI models and tools has been proposed, promising to empower researchers and academics worldwide to conduct their research more effectively and efficiently. 
11. arXiv:2502.05151 [pdf, other] - cs.CL, cs.AI, cs.CV, cs.LG
    Transforming Science with Large Language Models: A Survey on AI-assisted Scientific Discovery, Experimentation, Content Generation, and Evaluation
    Authors: Steffen Eger, Yong Cao, Jennifer D'Souza, Andreas Geiger, Christian Greisinger, Stephanie Gross, Yufang Hou, Brigitte Krenn, Anne Lauscher, Yizhi Li, Chenghua Lin, Nafise Sadat Moosavi, Wei Zhao, Tristan Miller
    Abstract: With the advent of large multimodal language models, science is now at a threshold of an AI-based technological transformation. Recently, a plethora of new AI models and tools has been proposed, promising to empower researchers and academics worldwide to conduct their research more effectively and efficiently. This includes all aspects of the research cycle, especially (1) searching for relevant literature; (2) generating research ideas and conducting experimentation; generating (3) text-based and (4) multimodal content (e.g., scientific figures and diagrams); and (5) AI-based automatic peer review. In this survey, we provide an in-depth overview over these exciting recent developments, which promise to fundamentally alter the scientific research process for good. Our survey covers the five aspects outlined above, indicating relevant datasets, methods and results (including evaluation) as well as limitations and scope for future research. Ethical concerns regarding shortcomings of these tools and potential for misuse (fake science, plagiarism, harms to research integrity) take a particularly prominent place in our discussion. We hope that our survey will not only become a reference guide for newcomers to the field but also a catalyst for new AI-based initiatives in the area of "AI4Science".
    Submitted 7 February, 2025; originally announced February 2025.
    Comments: Work in progress. Will be updated soon

arXiv:2502.04103 [pdf, other] (https://arxiv.org/abs/2502.04103)
Subjects: cs.HC (Human-Computer Interaction); cs.AI (Artificial Intelligence); cs.SE (Software Engineering)
Title: VTutor: An Open-Source SDK for Generative AI-Powered Animated Pedagogical Agents with Multi-Media Output
Authors: Eason Chen, Chenyu Lin, Xinyi Tang, Aprille Xi, Canwen Wang, Jionghao Lin, Kenneth R Koedinger
Abstract: The rapid evolution of large language models (LLMs) has transformed human-computer interaction (HCI), but interaction with LLMs currently centers on text, while other multimodal approaches remain under-explored. This paper introduces VTutor, an open-source Software Development Kit (SDK) that combines generative AI with advanced animation technologies to create engaging, adaptable, and realistic animated pedagogical agents (APAs) for human-AI multimedia interaction. VTutor leverages LLMs for real-time personalized feedback, advanced lip synchronization for natural speech alignment, and WebGL rendering for seamless web integration. Supporting various 2D and 3D character models, VTutor enables researchers and developers to design emotionally resonant, contextually adaptive learning agents. This toolkit enhances learner engagement, feedback receptivity, and human-AI interaction while promoting trustworthy AI principles in education. VTutor sets a new standard for next-generation APAs, offering an accessible, scalable solution for fostering meaningful and immersive human-AI interaction experiences. The VTutor project is open-sourced and welcomes community-driven contributions and showcases.
Submitted 13 February, 2025; v1 submitted 6 February, 2025; originally announced February 2025.

arXiv:2502.03998 [pdf, other] (https://arxiv.org/abs/2502.03998)
Subjects: cs.LG (Machine Learning); cs.AI (Artificial Intelligence); cs.GT (Computer Science and Game Theory); cs.MA (Multiagent Systems)
Title: Online Learning of Counter Categories and Ratings in PvP Games
Authors: Chiu-Chou Lin, I-Chen Wu
Abstract: In competitive games, strength ratings like Elo are widely used to quantify player skill and support matchmaking by accounting for skill disparities better than simple win rate statistics. However, scalar ratings cannot handle complex intransitive relationships, such as the counter strategies seen in Rock-Paper-Scissors. To address this, recent work introduced the Neural Rating Table and Neural Counter Table, which combine scalar ratings with discrete counter categories to model intransitivity. While effective, these methods rely on neural network training and cannot perform real-time updates. In this paper, we propose an online update algorithm that extends Elo principles to incorporate real-time learning of counter categories. Our method dynamically adjusts both ratings and counter relationships after each match, preserving the explainability of scalar ratings while addressing intransitivity. Experiments on zero-sum competitive games demonstrate its practicality, particularly in scenarios without complex team compositions.
Submitted 6 February, 2025; originally announced February 2025.
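The abstract specifies the goal of the update rule but not its exact form. The sketch below is a minimal illustration of the kind of real-time update being extended: a standard Elo step plus a purely hypothetical antisymmetric counter term keyed by category pair. The constants `K` and `ETA` and the `counter` table are assumptions for illustration, not the paper's algorithm.

```python
from collections import defaultdict

K = 32.0    # Elo step size; illustrative, not from the paper
ETA = 8.0   # counter-term step size; purely hypothetical

# Pairwise counter bias between discrete counter categories, kept
# antisymmetric: if category A counters B, then B is countered by A.
counter = defaultdict(float)

def expected_score(r_a, r_b, cat_a, cat_b):
    """Elo win expectancy for A vs. B, biased by the learned counter term."""
    return 1.0 / (1.0 + 10 ** ((r_b - r_a - counter[(cat_a, cat_b)]) / 400))

def online_update(r_a, r_b, cat_a, cat_b, score_a):
    """One real-time update after a match (score_a: 1 win, 0.5 draw, 0 loss)."""
    e_a = expected_score(r_a, r_b, cat_a, cat_b)
    r_a += K * (score_a - e_a)                    # standard Elo step
    r_b += K * ((1.0 - score_a) - (1.0 - e_a))
    delta = ETA * (score_a - e_a)                 # hypothetical counter update
    counter[(cat_a, cat_b)] += delta
    counter[(cat_b, cat_a)] -= delta
    return r_a, r_b
```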
arXiv:2502.02770 [pdf, other] (https://arxiv.org/abs/2502.02770)
Subjects: cs.LG (Machine Learning); cs.CL (Computation and Language)
Title: Twilight: Adaptive Attention Sparsity with Hierarchical Top-$p$ Pruning
Authors: Chaofan Lin, Jiaming Tang, Shuo Yang, Hanshuo Wang, Tian Tang, Boyu Tian, Ion Stoica, Song Han, Mingyu Gao
Abstract: Leveraging attention sparsity to accelerate long-context large language models (LLMs) has been a hot research topic. However, current algorithms such as sparse attention or key-value (KV) cache compression tend to use a fixed budget, which presents a significant challenge during deployment because it fails to account for the dynamic nature of real-world scenarios, where the optimal balance between accuracy and efficiency can vary greatly. In this paper, we find that applying top-$p$ sampling (nucleus sampling) to sparse attention can surprisingly achieve adaptive budgeting. Based on this, we propose Twilight, a framework that brings adaptive sparsity to any existing sparse attention algorithm without sacrificing accuracy. Empirical results show that Twilight can adaptively prune up to 98% of redundant tokens, leading to $15.4\times$ acceleration in self-attention operations and $3.9\times$ acceleration in end-to-end per-token latency in long-context LLM decoding.
Submitted 5 February, 2025; v1 submitted 4 February, 2025; originally announced February 2025.
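The borrowed mechanism itself is easy to state: per query, keep the smallest set of keys whose softmax attention mass reaches $p$, just as nucleus sampling does over a token distribution. A minimal PyTorch sketch of that selection rule follows; the function name and the unbatched (queries, keys) layout are illustrative, and this is not the paper's hierarchical kernel.

```python
import torch

def top_p_attention_mask(scores: torch.Tensor, p: float = 0.98) -> torch.Tensor:
    """Per query, keep the smallest set of keys whose softmax mass reaches p.

    scores: (num_queries, num_keys) raw attention logits.
    Returns a boolean mask (True = key kept) with an adaptive budget:
    peaked rows keep few keys, flat rows keep many.
    """
    probs = torch.softmax(scores, dim=-1)
    sorted_probs, order = probs.sort(dim=-1, descending=True)
    cumulative = sorted_probs.cumsum(dim=-1)
    # Keep every key whose preceding cumulative mass is still below p,
    # which includes the key that first crosses the threshold.
    keep_sorted = cumulative - sorted_probs < p
    return torch.zeros_like(probs, dtype=torch.bool).scatter(-1, order, keep_sorted)
```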
arXiv:2502.02322 [pdf, other] (https://arxiv.org/abs/2502.02322)
Subjects: cs.CV (Computer Vision and Pattern Recognition); cs.RO (Robotics)
Title: Improving Generalization Ability for 3D Object Detection by Learning Sparsity-invariant Features
Authors: Hsin-Cheng Lu, Chung-Yi Lin, Winston H. Hsu
Abstract: In autonomous driving, 3D object detection is essential for accurately identifying and tracking objects. Despite the continuous development of various technologies for this task, a significant drawback is observed in most of them: they experience substantial performance degradation when detecting objects in unseen domains. In this paper, we propose a method to improve the generalization ability of 3D object detection trained on a single domain. We primarily focus on generalizing from a single source domain to target domains with distinct sensor configurations and scene distributions. To learn sparsity-invariant features from a single source domain, we selectively subsample the source data to a specific beam, using confidence scores determined by the current detector to identify the density that holds the utmost importance for the detector. Subsequently, we employ a teacher-student framework to align the Bird's Eye View (BEV) features for different point cloud densities. We also utilize feature content alignment (FCA) and graph-based embedding relationship alignment (GERA) to make the detector domain-agnostic. Extensive experiments demonstrate that our method exhibits superior generalization capabilities compared to other baselines. Furthermore, our approach even outperforms certain domain adaptation methods that can access the target domain data.
Submitted 4 February, 2025; originally announced February 2025.
Comments: Accepted to ICRA 2025. Code is available at https://github.com/Tiffamy/3DOD-LSF
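The beam-subsampling step can be pictured with a small sketch: bucket points by elevation angle into the source sensor's beam count, then keep a subset of beams to emulate a sparser sensor. The NumPy code below is an illustration under that reading; the paper's confidence-guided choice of target density is omitted, and the binning-by-elevation proxy is an assumption.

```python
import numpy as np

def subsample_beams(points: np.ndarray, src_beams: int = 64,
                    keep_every: int = 2) -> np.ndarray:
    """Downsample a LiDAR sweep to fewer beams by binning vertical angles.

    points: (N, 4) array of x, y, z, intensity. When ring indices are
    unavailable, a common proxy is to bucket points by elevation angle into
    `src_beams` bins and keep every `keep_every`-th bin (e.g., 64 -> 32).
    The confidence-guided density selection described in the paper is
    intentionally not modeled here.
    """
    x, y, z = points[:, 0], points[:, 1], points[:, 2]
    elevation = np.arctan2(z, np.sqrt(x**2 + y**2))
    edges = np.linspace(elevation.min(), elevation.max() + 1e-6, src_beams + 1)
    beam_id = np.digitize(elevation, edges) - 1   # approximate beam index
    return points[beam_id % keep_every == 0]
```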
arXiv:2502.01045 [pdf, other] (https://arxiv.org/abs/2502.01045)
Subjects: cs.CV (Computer Vision and Pattern Recognition); cs.GR (Graphics)
Title: WonderHuman: Hallucinating Unseen Parts in Dynamic 3D Human Reconstruction
Authors: Zilong Wang, Zhiyang Dou, Yuan Liu, Cheng Lin, Xiao Dong, Yunhui Guo, Chenxu Zhang, Xin Li, Wenping Wang, Xiaohu Guo
Abstract: In this paper, we present WonderHuman, which reconstructs dynamic human avatars from monocular video for high-fidelity novel view synthesis. Previous dynamic human avatar reconstruction methods typically require the input video to have full coverage of the observed human body. However, in daily practice one typically has access to limited viewpoints, such as monocular front-view videos, making it a cumbersome task for previous methods to reconstruct the unseen parts of the human avatar. To tackle this issue, WonderHuman leverages 2D generative diffusion model priors to achieve high-quality, photorealistic reconstructions of dynamic human avatars from monocular videos, including accurate rendering of unseen body parts. Our approach introduces a Dual-Space Optimization technique, applying Score Distillation Sampling (SDS) in both canonical and observation spaces to ensure visual consistency and enhance realism in dynamic human reconstruction. Additionally, we present a View Selection strategy and Pose Feature Injection to enforce consistency between SDS predictions and observed data, ensuring pose-dependent effects and higher fidelity in the reconstructed avatar. In our experiments, the method achieves SOTA performance in producing photorealistic renderings from the given monocular video, particularly for challenging unseen parts. The project page and source code can be found at https://wyiguanw.github.io/WonderHuman/.
Submitted 2 February, 2025; originally announced February 2025.

arXiv:2502.00381 [pdf, other] (https://arxiv.org/abs/2502.00381)
Subjects: cs.HC (Human-Computer Interaction)
Title: Towards a Supporting Framework for Neuro-Developmental Disorder: Considering Artificial Intelligence, Serious Games and Eye Tracking
Authors: Abdul Rehman, Ilona Heldal, Diana Stilwell, Jerry Chun-Wei Lin
Abstract: This paper focuses on developing a framework for uncovering insights about NDD children's performance (e.g., raw gaze cluster analysis, duration analysis and area of interest for sustained attention, stimuli expectancy, loss of focus/motivation, inhibitory control) and informing their teachers. The hypothesis behind this work is that self-adaptation of games can contribute to improving students' well-being and performance by suggesting personalized activities (e.g., highlighting stimuli to increase attention or choosing a difficulty level that matches students' abilities). The aim is to examine how AI can be used to help solve this problem. The results would not only contribute to a better understanding of the problems of NDD children and their teachers but also help psychologists to validate the results against their clinical knowledge, improve communication with patients, and identify areas for further investigation, e.g., by explaining the decisions made and preserving the children's private data in the learning process.
Submitted 1 February, 2025; originally announced February 2025.
Journal ref: 2024 IEEE International Conference on Big Data (BigData)
arXiv:2502.00376 [pdf, other] (https://arxiv.org/abs/2502.00376)
Subjects: cs.LG (Machine Learning); cs.HC (Human-Computer Interaction); eess.SP (Signal Processing)
Title: SSRepL-ADHD: Adaptive Complex Representation Learning Framework for ADHD Detection from Visual Attention Tasks
Authors: Abdul Rehman, Ilona Heldal, Jerry Chun-Wei Lin
Abstract: Self-Supervised Representation Learning (SSRepL) can capture meaningful and robust representations of Attention Deficit Hyperactivity Disorder (ADHD) data and has the potential to improve model performance on downstream detection of other types of Neurodevelopmental Disorders (NDDs). In this paper, a novel SSRepL and Transfer Learning (TL)-based framework that incorporates a Long Short-Term Memory (LSTM) and a Gated Recurrent Units (GRU) model is proposed to detect children with potential symptoms of ADHD. This model uses Electroencephalogram (EEG) signals extracted during visual attention tasks to accurately detect ADHD, improving EEG signal quality through normalization, filtering, and data balancing. For the experimental analysis, we use three different models: 1) the SSRepL and TL-based LSTM-GRU model, named SSRepL-ADHD, which integrates LSTM and GRU layers to capture temporal dependencies in the data; 2) a lightweight SSRepL-based DNN model (LSSRepL-DNN); and 3) Random Forest (RF). These models are thoroughly evaluated using well-known performance metrics (i.e., accuracy, precision, recall, and F1-score). The results show that the proposed SSRepL-ADHD model achieves a maximum accuracy of 81.11% while acknowledging the difficulties associated with dataset imbalance and feature selection.
Submitted 1 February, 2025; originally announced February 2025.
Journal ref: 2024 IEEE International Conference on Big Data (BigData)
arXiv:2502.00050 [pdf, other] (https://arxiv.org/abs/2502.00050)
Subjects: cs.RO (Robotics); cs.LG (Machine Learning)
Title: DISC: Dataset for Analyzing Driving Styles In Simulated Crashes for Mixed Autonomy
Authors: Sandip Sharan Senthil Kumar, Sandeep Thalapanane, Guru Nandhan Appiya Dilipkumar Peethambari, Sourang SriHari, Laura Zheng, Ming C. Lin
Abstract: Handling pre-crash scenarios is still a major challenge for self-driving cars due to limited practical data and human-driving behavior datasets. We introduce DISC (Driving Styles In Simulated Crashes), one of the first datasets designed to capture various driving styles and behaviors in pre-crash scenarios for mixed autonomy analysis. DISC includes over 8 classes of driving styles/behaviors from hundreds of drivers navigating a simulated vehicle through a virtual city, encountering rare-event traffic scenarios. This dataset enables the classification of pre-crash human driving behaviors in unsafe conditions, supporting individualized trajectory prediction based on observed driving patterns. Data was collected with a custom-designed VR-based in-house driving simulator, TRAVERSE, through a driver-centric study involving human drivers encountering twelve simulated accident scenarios. The dataset fills a critical gap in human-centric driving data for rare events involving interactions with autonomous vehicles. It enables autonomous systems to better react to human drivers and optimize trajectory prediction in mixed autonomy environments involving both human-driven and self-driving cars. In addition, individual driving behaviors are classified through a set of standardized questionnaires, carefully designed to identify and categorize driving behavior traits. We correlate data features with driving behaviors, showing that the simulated environment reflects real-world driving styles. DISC is the first dataset to capture how various driving styles respond to accident scenarios, offering significant potential to enhance autonomous vehicle safety and driving behavior analysis in mixed autonomy environments.
Submitted 28 January, 2025; originally announced February 2025.
arXiv:2501.18982 [pdf, other] (https://arxiv.org/abs/2501.18982)
Subjects: cs.CV (Computer Vision and Pattern Recognition)
Title: OmniPhysGS: 3D Constitutive Gaussians for General Physics-Based Dynamics Generation
Authors: Yuchen Lin, Chenguo Lin, Jianjin Xu, Yadong Mu
Abstract: Recently, significant advancements have been made in the reconstruction and generation of 3D assets, including static cases and those with physical interactions. To recover the physical properties of 3D assets, existing methods typically assume that all materials belong to a specific predefined category (e.g., elasticity). However, such assumptions ignore the complex composition of multiple heterogeneous objects in real scenarios and tend to render less physically plausible animation for a wider range of objects. We propose OmniPhysGS for synthesizing physics-based 3D dynamic scenes composed of more general objects. A key design of OmniPhysGS is treating each 3D asset as a collection of constitutive 3D Gaussians. For each Gaussian, its physical material is represented by an ensemble of 12 physical domain-expert sub-models (rubber, metal, honey, water, etc.), which greatly enhances the flexibility of the proposed model. In the implementation, we define a scene by user-specified prompts and supervise the estimation of material weighting factors via a pretrained video diffusion model. Comprehensive experiments demonstrate that OmniPhysGS achieves more general and realistic physical dynamics across a broader spectrum of materials, including elastic, viscoelastic, plastic, and fluid substances, as well as interactions between different materials. Our method surpasses existing methods by approximately 3% to 16% in metrics of visual quality and text alignment.
Submitted 31 January, 2025; originally announced January 2025.
Comments: Accepted to ICLR 2025; Project page: https://wgsxm.github.io/projects/omniphysgs/
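One plausible reading of "material weighting factors" over an ensemble of sub-models is a per-Gaussian convex combination of expert outputs, sketched below in PyTorch. The softmax blending and the tensor shapes are assumptions for illustration; the paper's actual combination rule inside the simulator may differ.

```python
import torch

def blended_material_response(weight_logits: torch.Tensor,
                              expert_outputs: torch.Tensor) -> torch.Tensor:
    """Blend per-Gaussian expert constitutive models by learned weights.

    weight_logits:  (num_gaussians, 12) learnable logits, one per sub-model
        (rubber, metal, honey, water, ...), which the paper supervises via a
        pretrained video diffusion model.
    expert_outputs: (num_gaussians, 12, d) response predicted by each
        domain-expert sub-model for the current deformation state.
    """
    w = torch.softmax(weight_logits, dim=-1)          # convex weights per Gaussian
    return (w.unsqueeze(-1) * expert_outputs).sum(dim=1)
```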
arXiv:2501.18588 [pdf, other] (https://arxiv.org/abs/2501.18588)
Subjects: cs.HC (Human-Computer Interaction); cs.AI (Artificial Intelligence); cs.CV (Computer Vision and Pattern Recognition); cs.MM (Multimedia)
DOI: 10.1145/3706598.3713397
Title: Inkspire: Supporting Design Exploration with Generative AI through Analogical Sketching
Authors: David Chuan-En Lin, Hyeonsu B. Kang, Nikolas Martelaro, Aniket Kittur, Yan-Ying Chen, Matthew K. Hong
Abstract: With recent advancements in the capabilities of Text-to-Image (T2I) AI models, product designers have begun experimenting with them in their work. However, T2I models struggle to interpret abstract language, and the current user experience of T2I tools can induce design fixation rather than a more iterative, exploratory process. To address these challenges, we developed Inkspire, a sketch-driven tool that supports designers in prototyping product design concepts with analogical inspirations and a complete sketch-to-design-to-sketch feedback loop. To inform the design of Inkspire, we conducted an exchange session with designers and distilled design goals for improving T2I interactions. In a within-subjects study comparing Inkspire to ControlNet, we found that Inkspire supported designers with more inspiration and exploration of design ideas, and improved aspects of the co-creative process by allowing designers to effectively grasp the current state of the AI and guide it towards novel design intentions.
Submitted 30 January, 2025; originally announced January 2025.
Comments: Accepted to CHI 2025

arXiv:2501.17790 [pdf, other] (https://arxiv.org/abs/2501.17790)
Subjects: cs.CL (Computation and Language); cs.AI (Artificial Intelligence)
Title: BreezyVoice: Adapting TTS for Taiwanese Mandarin with Enhanced Polyphone Disambiguation -- Challenges and Insights
Authors: Chan-Jan Hsu, Yi-Cheng Lin, Chia-Chun Lin, Wei-Chih Chen, Ho Lam Chung, Chen-An Li, Yi-Chang Chen, Chien-Yu Yu, Ming-Ji Lee, Chien-Cheng Chen, Ru-Heng Huang, Hung-yi Lee, Da-Shan Shiu
Abstract: We present BreezyVoice, a Text-to-Speech (TTS) system specifically adapted for Taiwanese Mandarin, highlighting phonetic control abilities to address the unique challenges of polyphone disambiguation in the language. Building upon CosyVoice, we incorporate a $S^{3}$ tokenizer, a large language model (LLM), an optimal-transport conditional flow matching model (OT-CFM), and a grapheme-to-phoneme prediction model to generate realistic speech that closely mimics human utterances. Our evaluation demonstrates BreezyVoice's superior performance in both general and code-switching contexts, highlighting its robustness and effectiveness in generating high-fidelity speech. Additionally, we address the challenges of generalizability in modeling long-tail speakers and polyphone disambiguation. Our approach significantly enhances performance and offers valuable insights into the workings of neural codec TTS systems.
Submitted 29 January, 2025; originally announced January 2025.
arXiv:2501.17489 [pdf, other] (https://arxiv.org/abs/2501.17489)
Subjects: cs.HC (Human-Computer Interaction); cs.AI (Artificial Intelligence)
Title: Neural Spelling: A Spell-Based BCI System for Language Neural Decoding
Authors: Xiaowei Jiang, Charles Zhou, Yiqun Duan, Ziyi Zhao, Thomas Do, Chin-Teng Lin
Abstract: Brain-computer interfaces (BCIs) present a promising avenue by translating neural activity directly into text, eliminating the need for physical actions. However, existing non-invasive BCI systems have not successfully covered the entire alphabet, limiting their practicality. In this paper, we propose a novel non-invasive EEG-based BCI system with a Curriculum-based Neural Spelling Framework, which recognizes all 26 letters of the alphabet by first decoding neural signals associated with handwriting and then applying generative AI (GenAI) to enhance spell-based neural language decoding. Our approach combines the ease of handwriting with the accessibility of EEG technology, utilizing advanced neural decoding algorithms and pre-trained large language models (LLMs) to translate EEG patterns into text with high accuracy. This system shows how GenAI can improve the performance of typical spelling-based neural language decoding tasks and addresses the limitations of previous methods, offering a scalable and user-friendly solution for individuals with communication impairments, thereby enhancing inclusive communication options.
Submitted 29 January, 2025; originally announced January 2025.

arXiv:2501.16969 [pdf, other] (https://arxiv.org/abs/2501.16969)
Subjects: cs.CV (Computer Vision and Pattern Recognition)
Title: What Really Matters for Learning-based LiDAR-Camera Calibration
Authors: Shujuan Huang, Chunyu Lin, Yao Zhao
Abstract: Calibration is an essential prerequisite for the accurate data fusion of LiDAR and camera sensors. Traditional calibration techniques often require specific targets or suitable scenes to obtain reliable 2D-3D correspondences. To tackle the challenge of target-less and online calibration, deep neural networks have been introduced to solve the problem in a data-driven manner. While previous learning-based methods have achieved impressive performance on specific datasets, they still struggle in complex real-world scenarios. Most existing works focus on improving calibration accuracy but overlook the underlying mechanisms. In this paper, we revisit the development of learning-based LiDAR-camera calibration and encourage the community to pay more attention to the underlying principles to advance practical applications. We systematically analyze the paradigm of mainstream learning-based methods and identify the critical limitations of regression-based methods with the widely used data generation pipeline. Our findings reveal that most learning-based methods inadvertently operate as retrieval networks, focusing more on single-modality distributions than on cross-modality correspondences. We also investigate how the input data format and preprocessing operations impact network performance and summarize the regression clues to inform further improvements.
Submitted 28 January, 2025; originally announced January 2025.
arXiv:2501.16764 [pdf, other] (https://arxiv.org/abs/2501.16764)
Subjects: cs.CV (Computer Vision and Pattern Recognition)
Title: DiffSplat: Repurposing Image Diffusion Models for Scalable Gaussian Splat Generation
Authors: Chenguo Lin, Panwang Pan, Bangbang Yang, Zeming Li, Yadong Mu
Abstract: Recent advancements in 3D content generation from text or a single image struggle with limited high-quality 3D datasets and inconsistency from 2D multi-view generation. We introduce DiffSplat, a novel 3D generative framework that natively generates 3D Gaussian splats by taming large-scale text-to-image diffusion models. It differs from previous 3D generative models by effectively utilizing web-scale 2D priors while maintaining 3D consistency in a unified model. To bootstrap the training, a lightweight reconstruction model is proposed to instantly produce multi-view Gaussian splat grids for scalable dataset curation. In conjunction with the regular diffusion loss on these grids, a 3D rendering loss is introduced to facilitate 3D coherence across arbitrary views. The compatibility with image diffusion models enables seamless adaptation of numerous image generation techniques to the 3D realm. Extensive experiments reveal the superiority of DiffSplat in text- and image-conditioned generation tasks and downstream applications. Thorough ablation studies validate the efficacy of each critical design choice and provide insights into the underlying mechanism.
Submitted 28 January, 2025; originally announced January 2025.
Comments: Accepted to ICLR 2025; Project page: https://chenguolin.github.io/projects/DiffSplat
In this work, we mitigate this issue by reducing the number of active parameters without permanently removing them. Specifically, we introduce a differentiable dynamic pruning method that pushes dense models to maintain a fixed number of active parameters by converting their MLP layers into a Mixture of Experts (MoE) architecture. Even without fine-tuning, our method consistently outperforms previous structural pruning techniques across diverse model families, including Phi-2, LLaMA-2, LLaMA-3, and Qwen-2.5.
Submitted 25 January, 2025; originally announced January 2025.
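A minimal sketch of the general dense-to-MoE conversion idea follows, assuming a standard top-k router over groups of an existing FFN's hidden units; the paper's differentiable pruning objective and routing details are not reproduced here, and all names are illustrative.

```python
# Hedged sketch: regroup a dense FFN's hidden units into "experts" and activate
# only top_k groups per token, keeping active parameters fixed without deleting
# any weights. Not ToMoE's actual training procedure.
import torch
import torch.nn as nn

class MLPAsMoE(nn.Module):
    def __init__(self, d_model=64, d_hidden=256, n_experts=8, top_k=2):
        super().__init__()
        assert d_hidden % n_experts == 0
        self.n_experts, self.top_k = n_experts, top_k
        self.chunk = d_hidden // n_experts
        self.w_in = nn.Linear(d_model, d_hidden)   # original dense weights, regrouped
        self.w_out = nn.Linear(d_hidden, d_model)
        self.router = nn.Linear(d_model, n_experts)

    def forward(self, x):                                      # x: (tokens, d_model)
        gates = torch.softmax(self.router(x), dim=-1)          # (T, E)
        topv, topi = gates.topk(self.top_k, dim=-1)            # keep k experts/token
        mask = torch.zeros_like(gates).scatter_(-1, topi, topv)
        h = torch.relu(self.w_in(x))                           # (T, d_hidden)
        h = h.view(x.size(0), self.n_experts, self.chunk)      # group hidden units
        h = h * mask.unsqueeze(-1)                             # zero inactive experts
        return self.w_out(h.reshape(x.size(0), -1))

x = torch.randn(5, 64)
print(MLPAsMoE()(x).shape)   # torch.Size([5, 64])
```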
arXiv:2501.14713 [pdf, other] (cs.CL, cs.LG)
FlexiGPT: Pruning and Extending Large Language Models with Low-Rank Weight Sharing
Authors: James Seale Smith, Chi-Heng Lin, Shikhar Tuli, Haris Jeelani, Shangqian Gao, Yilin Shen, Hongxia Jin, Yen-Chang Hsu
Abstract: The rapid proliferation of large language models (LLMs) in natural language processing (NLP) has created a critical need for techniques that enable efficient deployment on memory-constrained devices without compromising performance. We present a method that selectively prunes model blocks based on an importance score and replaces them with a low-parameter replacement strategy. Specifically, we propose a principled metric to replace each pruned block using a weight-sharing mechanism that leverages unpruned counterparts from the model and block-specific low-rank adapters. Furthermore, we facilitate the learning of these replacement blocks with output feature normalization and an adapter initialization scheme built on low-rank SVD reconstructions. Empirical evaluations demonstrate substantial performance gains over existing methods, achieving state-of-the-art performance on 5/6 benchmarks at a 30% compression rate and on 6/6 benchmarks at 40%. We also demonstrate that our approach can extend smaller models, boosting performance on 6/6 benchmarks using only ~0.3% tokens of extended training with minimal additional parameter costs.
Submitted 31 January, 2025; v1 submitted 24 January, 2025; originally announced January 2025.
Comments: Accepted to NAACL 2025 - Main Conference
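The block-replacement idea can be sketched as a frozen, weight-shared donor block plus a block-specific low-rank adapter. The zero initialization below is a simplification of the SVD-based scheme the abstract mentions, and all names are hypothetical.

```python
# Hedged sketch of a low-rank replacement for a pruned block:
# y = donor(x) + B @ A @ x, with the donor's shared weights frozen.
import torch
import torch.nn as nn

class LowRankReplacement(nn.Module):
    def __init__(self, donor: nn.Module, d_model=64, rank=4):
        super().__init__()
        self.donor = donor
        for p in self.donor.parameters():
            p.requires_grad = False           # shared weights stay frozen
        self.A = nn.Linear(d_model, rank, bias=False)
        self.B = nn.Linear(rank, d_model, bias=False)
        nn.init.zeros_(self.B.weight)         # start as an exact copy of the donor

    def forward(self, x):
        return self.donor(x) + self.B(self.A(x))

donor = nn.Linear(64, 64)                     # stand-in for a kept transformer block
replacement = LowRankReplacement(donor)
print(replacement(torch.randn(2, 64)).shape)  # torch.Size([2, 64])
```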
arXiv:2501.14315 [pdf, other] (cs.CL)
Clear Minds Think Alike: What Makes LLM Fine-tuning Robust? A Study of Token Perplexity
Authors: Chao-Chung Wu, Zhi Rui Tam, Chieh-Yen Lin, Hung-yi Lee, Yun-Nung Chen
Abstract: Maintaining consistent model performance across domains is a fundamental challenge in machine learning. While recent work has explored using LLM-generated data for fine-tuning, its impact on cross-domain generalization remains poorly understood. In this paper, we present a systematic analysis revealing that fine-tuning with LLM-generated data not only improves target task performance but also reduces out-of-domain (OOD) degradation compared to fine-tuning with ground truth data. Through analyzing the data sequences in tasks of various domains, we demonstrate that this enhanced OOD robustness stems from a reduced prevalence of high-perplexity tokens in LLM-generated sequences. Following this hypothesis, we show that masking high-perplexity tokens in ground truth training data achieves OOD preservation comparable to using LLM-generated data. Extensive experiments across diverse model architectures and scales, including Gemma2-2B, Mistral-7B and Llama3-8B, corroborate the consistency of our findings. To the best of our knowledge, this work provides the first mechanistic explanation for the superior OOD robustness conferred by LLM-generated training data, offering valuable insights for developing more robust fine-tuning strategies.
Submitted 24 January, 2025; originally announced January 2025.
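A minimal sketch of the masking recipe this analysis suggests, assuming per-token NLL as the perplexity proxy and a batch-quantile threshold (both illustrative choices, not the paper's exact procedure):

```python
# Hedged sketch: cross-entropy that drops the highest-perplexity tokens.
# Token-level perplexity is monotone in per-token NLL, so thresholding NLL
# at a batch quantile masks the high-perplexity tokens from the loss.
import torch
import torch.nn.functional as F

def masked_lm_loss(logits, targets, quantile=0.9):
    nll = F.cross_entropy(logits.view(-1, logits.size(-1)),
                          targets.view(-1), reduction="none")
    keep = nll <= torch.quantile(nll, quantile)   # mask high-perplexity tokens
    return (nll * keep).sum() / keep.sum().clamp(min=1)

logits = torch.randn(2, 10, 100)                  # (batch, seq, vocab)
targets = torch.randint(0, 100, (2, 10))
print(masked_lm_loss(logits, targets).item())
```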
arXiv:2501.13652 [pdf, other] (cs.CL)
LVPruning: An Effective yet Simple Language-Guided Vision Token Pruning Approach for Multi-modal Large Language Models
Authors: Yizheng Sun, Yanze Xin, Hao Li, Jingyuan Sun, Chenghua Lin, Riza Batista-Navarro
Abstract: Multi-modal Large Language Models (MLLMs) have achieved remarkable success by integrating visual and textual modalities. However, they incur significant computational overhead due to the large number of vision tokens processed, limiting their practicality in resource-constrained environments. We introduce Language-Guided Vision Token Pruning (LVPruning) for MLLMs, an effective yet simple method that significantly reduces the computational burden while preserving model performance. LVPruning employs cross-attention modules to compute the importance of vision tokens based on their interaction with language tokens, determining which to prune. Importantly, LVPruning can be integrated without modifying the original MLLM parameters, which makes it simple to apply or remove. Our experiments show that LVPruning can effectively remove up to 90% of vision tokens by the middle layer of LLaVA-1.5, resulting in a 62.1% decrease in inference Tera Floating-Point Operations (TFLOPs), with an average performance loss of just 0.45% across nine multi-modal benchmarks.
Submitted 23 January, 2025; originally announced January 2025.
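A rough sketch of language-guided importance scoring follows; the paper uses learned cross-attention modules, whereas this stand-in scores vision tokens with plain scaled dot-product attention from language tokens.

```python
# Hedged sketch: score each vision token by its total cross-attention mass
# from language tokens, then keep only the top fraction.
import torch

def vision_token_importance(vision, language):
    # vision: (Nv, d), language: (Nl, d); returns (Nv,) importance scores.
    attn = torch.softmax(language @ vision.T / vision.size(-1) ** 0.5, dim=-1)
    return attn.sum(dim=0)

def prune_vision_tokens(vision, language, keep_ratio=0.1):
    scores = vision_token_importance(vision, language)
    k = max(1, int(keep_ratio * vision.size(0)))
    return vision[scores.topk(k).indices]

vision, language = torch.randn(576, 64), torch.randn(32, 64)
print(prune_vision_tokens(vision, language).shape)   # ~10% of 576 tokens kept
```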
arXiv:2501.13629 [pdf, other] (cs.CL)
Sigma: Differential Rescaling of Query, Key and Value for Efficient Language Models
Authors: Zhenghao Lin, Zihao Tang, Xiao Liu, Yeyun Gong, Yi Cheng, Qi Chen, Hang Li, Ying Xin, Ziyue Yang, Kailai Yang, Yu Yan, Xiao Liang, Shuai Lu, Yiming Huang, Zheheng Luo, Lei Qu, Xuan Feng, Yaoxiang Wang, Yuqing Xia, Feiyang Chen, Yuting Jiang, Yasen Hu, Hao Ni, Binyang Li, Guoshuai Zhao, et al. (9 additional authors not shown)
Abstract: We introduce Sigma, an efficient large language model specialized for the system domain, empowered by a novel architecture including DiffQKV attention, and pre-trained on our meticulously collected system domain data.
DiffQKV attention significantly enhances the inference efficiency of Sigma by optimizing the Query (Q), Key (K), and Value (V) components in the attention mechanism differentially, based on their varying impacts on model performance and efficiency indicators. Specifically, we (1) conduct extensive experiments that demonstrate the model's varying sensitivity to the compression of K and V components, leading to the development of differentially compressed KV, and (2) propose augmented Q to expand the Q head dimension, which enhances the model's representation capacity with minimal impact on inference speed. Rigorous theoretical and empirical analyses reveal that DiffQKV attention significantly enhances efficiency, achieving up to a 33.36% improvement in inference speed over conventional grouped-query attention (GQA) in long-context scenarios. We pre-train Sigma on 6T tokens from various sources, including 19.5B tokens of system domain data that we carefully collect and 1T tokens of synthesized and rewritten data. In general domains, Sigma achieves performance comparable to other state-of-the-art models. In the system domain, we introduce the first comprehensive benchmark, AIMicius, where Sigma demonstrates remarkable performance across all tasks, significantly outperforming GPT-4 with an absolute improvement of up to 52.5%.
Submitted 10 February, 2025; v1 submitted 23 January, 2025; originally announced January 2025.
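A toy sketch of differential KV compression, assuming K is given fewer heads than V and both are broadcast back to the query head count, GQA-style. The head counts are illustrative, and the augmented-Q projection is omitted for brevity.

```python
# Hedged sketch: compress K more aggressively than V by giving it fewer heads,
# then repeat both up to the query head count before standard attention.
import torch

def diffqkv_style_attention(x, wq, wk, wv, n_q=8, n_k=2, n_v=4):
    T, d = x.shape
    hd = d // n_q
    q = (x @ wq).view(T, n_q, hd)
    k = (x @ wk).view(T, n_k, hd).repeat_interleave(n_q // n_k, dim=1)
    v = (x @ wv).view(T, n_v, hd).repeat_interleave(n_q // n_v, dim=1)
    attn = torch.softmax(torch.einsum("qhd,khd->hqk", q, k) / hd ** 0.5, dim=-1)
    return torch.einsum("hqk,khd->qhd", attn, v).reshape(T, d)

d, T = 64, 10
x = torch.randn(T, d)
wq = torch.randn(d, d)        # full query projection
wk = torch.randn(d, d // 4)   # K compressed to 2 heads
wv = torch.randn(d, d // 2)   # V compressed less, to 4 heads
print(diffqkv_style_attention(x, wq, wk, wv).shape)   # (10, 64)
```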
arXiv:2501.13132 [pdf, other] (cs.MA, cs.AI, cs.RO, eess.SY)
A Hierarchical Reinforcement Learning Framework for Multi-UAV Combat Using Leader-Follower Strategy
Authors: Jinhui Pang, Jinglin He, Noureldin Mohamed Abdelaal Ahmed Mohamed, Changqing Lin, Zhihui Zhang, Xiaoshuai Hao
Abstract: Multi-UAV air combat is a complex task involving multiple autonomous UAVs, an evolving field in both aerospace and artificial intelligence. This paper aims to enhance adversarial performance through collaborative strategies. Previous approaches predominantly discretize the action space into predefined actions, limiting UAV maneuverability and the implementation of complex strategies. Others simplify the problem to 1v1 combat, neglecting the cooperative dynamics among multiple UAVs. To address the high-dimensional challenges inherent in six-degree-of-freedom space and improve cooperation, we propose a hierarchical framework utilizing the Leader-Follower Multi-Agent Proximal Policy Optimization (LFMAPPO) strategy. Specifically, the framework is structured into three levels. The top level conducts a macro-level assessment of the environment and guides the execution policy. The middle level determines the angle of the desired action. The bottom level generates precise action commands for the high-dimensional action space.
Moreover, we optimize the state-value functions by assigning distinct roles with the leader-follower strategy to train the top-level policy: followers estimate the leader's utility, promoting effective cooperation among agents. Additionally, a target selector, aligned with the UAVs' posture, assesses the threat level of targets. Finally, simulation experiments validate the effectiveness of our proposed method.
Submitted 21 January, 2025; originally announced January 2025.
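A schematic of the three-level decomposition (not the paper's LFMAPPO training setup): module sizes and interfaces below are invented purely for illustration.

```python
# Hedged sketch: top level scores macro options from the state, middle level
# turns the chosen option into desired maneuver angles, bottom level emits
# continuous actuator commands for the high-dimensional action space.
import torch
import torch.nn as nn

class ThreeLevelPolicy(nn.Module):
    def __init__(self, obs_dim=32, n_options=4, act_dim=6):
        super().__init__()
        self.top = nn.Linear(obs_dim, n_options)
        self.middle = nn.Linear(obs_dim + n_options, 2)   # desired angles
        self.bottom = nn.Linear(obs_dim + 2, act_dim)     # 6-DoF commands

    def forward(self, obs):
        option = torch.softmax(self.top(obs), dim=-1)
        angle = torch.tanh(self.middle(torch.cat([obs, option], dim=-1)))
        return torch.tanh(self.bottom(torch.cat([obs, angle], dim=-1)))

print(ThreeLevelPolicy()(torch.randn(3, 32)).shape)   # (3, 6)
```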
arXiv:2501.13042 [pdf, other] (cs.CL)
Does Table Source Matter? Benchmarking and Improving Multimodal Scientific Table Understanding and Reasoning
Authors: Bohao Yang, Yingji Zhang, Dong Liu, André Freitas, Chenghua Lin
Abstract: Recent large language models (LLMs) have advanced table understanding capabilities but rely on converting tables into text sequences. While multimodal large language models (MLLMs) enable direct visual processing, they face limitations in handling scientific tables due to fixed input image resolutions and insufficient numerical reasoning capabilities. We present a comprehensive framework for multimodal scientific table understanding and reasoning with dynamic input image resolutions. Our framework consists of three key components: (1) MMSci-Pre, a domain-specific table structure learning dataset of 52K scientific table structure recognition samples, (2) MMSci-Ins, an instruction tuning dataset with 12K samples across three table-based tasks, and (3) MMSci-Eval, a benchmark with 3,114 testing samples specifically designed to evaluate numerical reasoning capabilities. Extensive experiments demonstrate that our domain-specific approach with 52K scientific table images achieves superior performance compared to 150K general-domain tables, highlighting the importance of data quality over quantity. Our proposed table-based MLLMs with dynamic input resolutions show significant improvements in both general table understanding and numerical reasoning capabilities, with strong generalisation to held-out datasets. Our code and data are publicly available at https://github.com/Bernard-Yang/MMSci_Table.
Submitted 25 February, 2025; v1 submitted 22 January, 2025; originally announced January 2025.
inline;"> While ML model training and inference are both GPU-intensive, CPU-based data processing is often the bottleneck. Distributed data processing systems based on the batch or stream processing models assume homogeneous resource requirements. They excel at CPU-based computation but either under-utilize heterogeneous resources or impose high overheads on failure and reconfiguration. We introduce the str&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.12407v4-abstract-full').style.display = 'inline'; document.getElementById('2501.12407v4-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.12407v4-abstract-full" style="display: none;"> While ML model training and inference are both GPU-intensive, CPU-based data processing is often the bottleneck. Distributed data processing systems based on the batch or stream processing models assume homogeneous resource requirements. They excel at CPU-based computation but either under-utilize heterogeneous resources or impose high overheads on failure and reconfiguration. We introduce the streaming batch model, a hybrid of the two models that enables efficient and fault-tolerant heterogeneous execution. The key idea is to execute one partition at a time to allow lineage-based recovery with dynamic resource allocation. This enables memory-efficient pipelining across heterogeneous resources, similar to stream processing, but also offers the elasticity and fault tolerance properties of batch processing. We present Ray Data, an implementation of the streaming batch model that improves throughput on heterogeneous batch inference pipelines by 3--8$\times$ compared to traditional batch and stream processing systems. When training Stable Diffusion, Ray Data matches the throughput of single-node ML data loaders while additionally leveraging distributed heterogeneous clusters to further improve training throughput by 31%. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.12407v4-abstract-full').style.display = 'none'; document.getElementById('2501.12407v4-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 16 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. 
arXiv:2501.12393 [pdf, other] (cs.CV)
Towards Affordance-Aware Articulation Synthesis for Rigged Objects
Authors: Yu-Chu Yu, Chieh Hubert Lin, Hsin-Ying Lee, Chaoyang Wang, Yu-Chiang Frank Wang, Ming-Hsuan Yang
Abstract: Rigged objects are commonly used in artist pipelines, as they can flexibly adapt to different scenes and postures. However, articulating the rigs into realistic affordance-aware postures (e.g., following the context, respecting the physics and the personalities of the object) remains time-consuming and relies heavily on human labor from experienced artists. In this paper, we tackle this novel problem and design A3Syn. Given a context, such as an environment mesh and a text prompt describing the desired posture, A3Syn synthesizes articulation parameters for arbitrary and open-domain rigged objects obtained from the Internet. The task is incredibly challenging due to the lack of training data, and we do not make any topological assumptions about the open-domain rigs. We propose using a 2D inpainting diffusion model and several control techniques to synthesize in-context affordance information. Then, we develop an efficient bone correspondence alignment using a combination of differentiable rendering and semantic correspondence. A3Syn has stable convergence, completes in minutes, and synthesizes plausible affordance on different combinations of in-the-wild object rigs and scenes.
Submitted 21 January, 2025; originally announced January 2025.
Comments: Project page: https://chuyu.org/research/a3syn
arXiv:2501.12136 [pdf, other] (cs.LG)
Heterogeneous Federated Learning Systems for Time-Series Power Consumption Prediction with Multi-Head Embedding Mechanism
Authors: Jia-Hao Syu, Jerry Chun-Wei Lin, Gautam Srivastava, Unil Yun
Abstract: Time-series prediction is increasingly popular in a variety of applications, such as smart factories and smart transportation. Researchers have used various techniques to predict power consumption, but existing models lack discussion of collaborative learning and privacy issues among multiple clients. To address these issues, we propose Multi-Head Heterogeneous Federated Learning (MHHFL) systems that consist of multiple head networks, which independently act as carriers for federated learning. In the federated period, each head network is embedded into a 2-dimensional vector and shared with the centralized source pool. MHHFL then selects appropriate source networks and blends them into the head networks for knowledge transfer in federated learning.
The experimental results show that the proposed MHHFL systems significantly outperform the benchmark and state-of-the-art systems, reducing prediction error by 24.9% to 94.1%. Ablation studies demonstrate the effectiveness of the proposed head network embedding and selection mechanisms, which significantly outperform traditional federated averaging and random transfer.
Submitted 21 January, 2025; originally announced January 2025.
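A toy rendering of the embed, select, and blend loop described above, assuming a shared random projection as the 2-dimensional embedder and simple parameter averaging as the blend; both are stand-ins for the paper's actual mechanisms.

```python
# Hedged sketch: embed each head network into a 2-D vector, pick the nearest
# source head from the pool, then blend its parameters into the local head.
import torch

def embed_head(params, proj):
    flat = torch.cat([p.flatten() for p in params])
    return proj @ flat                               # 2-D embedding

def select_and_blend(my_params, pool, proj, alpha=0.5):
    me = embed_head(my_params, proj)
    dists = [torch.dist(me, embed_head(p, proj)) for p in pool]
    best = pool[min(range(len(pool)), key=lambda i: dists[i])]
    return [alpha * a + (1 - alpha) * b for a, b in zip(my_params, best)]

torch.manual_seed(0)
dim = 10
proj = torch.randn(2, 2 * dim)                       # shared 2-D projector
heads = [[torch.randn(dim), torch.randn(dim)] for _ in range(4)]
blended = select_and_blend(heads[0], heads[1:], proj)
print([t.shape for t in blended])                    # two (10,) tensors
```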
arXiv:2501.12133 [pdf, other] (cs.LG)
Distributed Multi-Head Learning Systems for Power Consumption Prediction
Authors: Jia-Hao Syu, Jerry Chun-Wei Lin, Philip S. Yu
Abstract: As automatic vehicles proliferate, power consumption prediction becomes a vital issue for task scheduling and energy management. Most research focuses on autonomous vehicles in transportation, but few studies address automatic ground vehicles (AGVs) in smart factories, which face complex environments and generate large amounts of data. There is an inevitable trade-off between feature diversity and interference. In this paper, we propose Distributed Multi-Head learning (DMH) systems for power consumption prediction in smart factories. Multi-head learning mechanisms are proposed in DMH to reduce noise interference and improve accuracy. Additionally, DMH systems are designed as distributed and split learning, reducing the client-to-server transmission cost, sharing knowledge without sharing local data and models, and enhancing privacy and security. Experimental results show that the proposed DMH systems rank in the top two on most datasets and scenarios. The DMH-E system reduces the error of the state-of-the-art systems by 14.5% to 24.0%. Effectiveness studies demonstrate the value of Pearson correlation-based feature engineering, and feature grouping with the proposed multi-head learning further enhances prediction performance.
Submitted 21 January, 2025; originally announced January 2025.
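The transmission-saving split can be sketched as a small client-side head feeding a server-side body, so only compact activations cross the wire. Layer sizes below are arbitrary, and the sketch omits DMH's multi-head feature grouping.

```python
# Hedged split-learning sketch: the client keeps a small head locally and sends
# only its compact output (not raw data or the full model) to a shared server
# body, cutting client-to-server transmission cost.
import torch
import torch.nn as nn

client_head = nn.Sequential(nn.Linear(20, 8), nn.ReLU())   # stays on the client
server_body = nn.Sequential(nn.Linear(8, 16), nn.ReLU(), nn.Linear(16, 1))

raw_features = torch.randn(32, 20)        # private sensor data, never transmitted
compact = client_head(raw_features)       # 8 floats per sample cross the wire
prediction = server_body(compact)
print(prediction.shape)                   # torch.Size([32, 1])
```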
arXiv:2501.12125 [pdf, other] (cs.LG)
Heterogeneous Federated Learning System for Sparse Healthcare Time-Series Prediction
Authors: Jia-Hao Syu, Jerry Chun-Wei Lin
Abstract: In this paper, we propose a heterogeneous federated learning (HFL) system for sparse time-series prediction in healthcare: a decentralized federated learning algorithm with heterogeneous transfers. We design dense and sparse feature tensors to deal with the sparsity of data sources. Heterogeneous federated learning is developed to share asynchronous parts of networks and select appropriate models for knowledge transfer. Experimental results show that the proposed HFL achieves the lowest prediction error among all benchmark systems on eight out of ten prediction tasks, with MSE reductions of 94.8%, 48.3%, and 52.1% compared to the benchmark systems. These results demonstrate the effectiveness of HFL in transferring knowledge from heterogeneous domains, especially for smaller target domains. Ablation studies then demonstrate the effectiveness of the designed mechanisms for heterogeneous domain selection and switching in predicting healthcare time series with privacy, model security, and heterogeneous knowledge transfer.
Submitted 21 January, 2025; originally announced January 2025.
arXiv:2501.11895 [pdf, other] (cs.CV, cs.LG); doi:10.1109/SMC54092.2024.10831598
Contrastive Masked Autoencoders for Character-Level Open-Set Writer Identification
Authors: Xiaowei Jiang, Wenhao Ma, Yiqun Duan, Thomas Do, Chin-Teng Lin
Abstract: In the realm of digital forensics and document authentication, writer identification plays a crucial role in determining the authors of documents based on handwriting styles. The primary challenge in writer identification is the "open-set scenario", where the goal is to accurately recognize writers unseen during model training. To overcome this challenge, representation learning is key: it can capture unique handwriting features, enabling the model to recognize styles not previously encountered during training. Building on this concept, this paper introduces Contrastive Masked Auto-Encoders (CMAE) for character-level open-set writer identification. We merge Masked Auto-Encoders (MAE) with Contrastive Learning (CL) to capture sequential information and distinguish diverse handwriting styles, respectively. Demonstrating its effectiveness, our model achieves state-of-the-art (SOTA) results on the CASIA online handwriting dataset, reaching a precision of 89.7%. Our study advances universal writer identification with a sophisticated representation learning approach, contributing substantially to the evolving landscape of digital handwriting analysis.
Submitted 21 January, 2025; originally announced January 2025.
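A compact sketch of combining a masked-reconstruction term with a contrastive (InfoNCE) term, as the abstract describes; the shapes, temperature, and loss weighting are illustrative, not the paper's settings.

```python
# Hedged sketch of a CMAE-style objective: MSE on masked positions plus
# InfoNCE between embeddings of two views of the same handwriting sample.
import torch
import torch.nn.functional as F

def cmae_style_loss(recon, target, z1, z2, mask, lam=1.0, tau=0.1):
    # recon/target: (B, T, d) sequences; mask: (B, T) bool of masked positions;
    # z1/z2: (B, e) embeddings of two augmented views of each sample.
    rec = ((recon - target) ** 2).mean(-1)[mask].mean()        # MAE term
    z1, z2 = F.normalize(z1, dim=-1), F.normalize(z2, dim=-1)
    logits = z1 @ z2.T / tau                                   # contrastive term
    ctr = F.cross_entropy(logits, torch.arange(z1.size(0)))
    return rec + lam * ctr

B, T, d, e = 8, 16, 32, 64
recon, target = torch.randn(B, T, d), torch.randn(B, T, d)
mask = torch.rand(B, T) < 0.75                                 # 75% masked
z1, z2 = torch.randn(B, e), torch.randn(B, e)
print(cmae_style_loss(recon, target, z1, z2, mask).item())
```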
arXiv:2501.10963 [pdf, other] (cs.CE)
Open FinLLM Leaderboard: Towards Financial AI Readiness
Authors: Shengyuan Colin Lin, Felix Tian, Keyi Wang, Xingjian Zhao, Jimin Huang, Qianqian Xie, Luca Borella, Matt White, Christina Dan Wang, Kairong Xiao, Xiao-Yang Liu Yanglet, Li Deng
Abstract: Financial large language models (FinLLMs) with multimodal capabilities are envisioned to revolutionize applications across business, finance, accounting, and auditing. However, real-world adoption requires robust benchmarks of FinLLMs' and agents' performance. Maintaining an open leaderboard of models is crucial for encouraging innovative adoption and improving model effectiveness. In collaboration with the Linux Foundation and Hugging Face, we create an open FinLLM leaderboard, which serves as an open platform for assessing and comparing LLMs' performance on a wide spectrum of financial tasks. By democratizing access to advanced AI tools and financial knowledge, a chatbot or agent may enhance the analytical capabilities of the general public to a professional level within a few months of usage. This open leaderboard welcomes contributions from academia, the open-source community, industry, and other stakeholders. In particular, we encourage contributions of new datasets, tasks, and models for continual updates. By fostering a collaborative and open ecosystem, we seek to ensure the long-term sustainability and relevance of LLMs and agents as they evolve with the financial sector's needs.
Submitted 19 January, 2025; originally announced January 2025.
arXiv:2501.09980 [pdf] (cs.CV, cs.AI, cs.LG)
Aneumo: A Large-Scale Comprehensive Synthetic Dataset of Aneurysm Hemodynamics
Authors: Xigui Li, Yuanye Zhou, Feiyang Xiao, Xin Guo, Yichi Zhang, Chen Jiang, Jianchao Ge, Xiansheng Wang, Qimeng Wang, Taiwei Zhang, Chensen Lin, Yuan Cheng, Yuan Qi
Abstract: Intracranial aneurysm (IA) is a common cerebrovascular disease that is usually asymptomatic but may cause severe subarachnoid hemorrhage (SAH) if ruptured. Although clinical practice is usually based on individual factors and morphological features of the aneurysm, its pathophysiology and hemodynamic mechanisms remain controversial. To address the limitations of current research, this study constructed a comprehensive hemodynamic dataset of intracranial aneurysms. The dataset is based on 466 real aneurysm models, from which 10,000 synthetic models were generated by resection and deformation operations, comprising 466 aneurysm-free models and 9,534 deformed aneurysm models. The dataset also provides medical-image-like segmentation mask files to support insightful analysis.
In addition, the dataset contains hemodynamic data measured at eight steady-state flow rates (0.001 to 0.004 kg/s), including critical parameters such as flow velocity, pressure, and wall shear stress, providing a valuable resource for investigating aneurysm pathogenesis and clinical prediction. This dataset will help advance the understanding of the pathologic features and hemodynamic mechanisms of intracranial aneurysms and support in-depth research in related fields. Dataset hosted at https://github.com/Xigui-Li/Aneumo. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.09980v1-abstract-full').style.display = 'none'; document.getElementById('2501.09980v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.09354">arXiv:2501.09354</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2501.09354">pdf</a>, <a href="https://arxiv.org/format/2501.09354">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Style4Rec: Enhancing Transformer-based E-commerce Recommendation Systems with Style and Shopping Cart Information </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Ugurlu%2C+B">Berke Ugurlu</a>, <a href="/search/cs?searchtype=author&amp;query=Hong%2C+M">Ming-Yi Hong</a>, <a href="/search/cs?searchtype=author&amp;query=Lin%2C+C">Che Lin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.09354v1-abstract-short" style="display: inline;"> Understanding users&#39; product preferences is essential to the efficacy of a recommendation system. Precision marketing leverages users&#39; historical data to discern these preferences and recommends products that align with them. However, recent browsing and purchase records might better reflect current purchasing inclinations. Transformer-based recommendation systems have made strides in sequential r&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.09354v1-abstract-full').style.display = 'inline'; document.getElementById('2501.09354v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.09354v1-abstract-full" style="display: none;"> Understanding users&#39; product preferences is essential to the efficacy of a recommendation system. Precision marketing leverages users&#39; historical data to discern these preferences and recommends products that align with them. However, recent browsing and purchase records might better reflect current purchasing inclinations. Transformer-based recommendation systems have made strides in sequential recommendation tasks, but they often fall short in utilizing product image style information and shopping cart data effectively. 
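To make the dataset's structure concrete, here is a minimal sketch of slicing a hemodynamics table by flow rate. The column names, units, and values are assumptions for illustration only; see the repository above for the actual format.

```python
# Hypothetical table layout for per-model hemodynamic summaries.
import pandas as pd

df = pd.DataFrame({
    "model_id":          [1, 1, 2, 2],
    "flow_rate_kg_s":    [0.001, 0.004, 0.001, 0.004],  # eight rates in the real data
    "wall_shear_stress": [1.2, 4.8, 0.9, 3.7],          # illustrative values
    "pressure":          [110.0, 180.0, 95.0, 160.0],   # illustrative values
})

# Example query: wall shear stress statistics at the lowest steady-state flow rate.
print(df[df["flow_rate_kg_s"] == 0.001]["wall_shear_stress"].describe())
```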
arXiv:2501.09354 [pdf, other] https://arxiv.org/abs/2501.09354
Subjects: cs.IR (Information Retrieval); cs.AI (Artificial Intelligence)
Title: Style4Rec: Enhancing Transformer-based E-commerce Recommendation Systems with Style and Shopping Cart Information
Authors: Berke Ugurlu, Ming-Yi Hong, Che Lin
Abstract: Understanding users' product preferences is essential to the efficacy of a recommendation system. Precision marketing leverages users' historical data to discern these preferences and recommends products that align with them. However, recent browsing and purchase records might better reflect current purchasing inclinations. Transformer-based recommendation systems have made strides in sequential recommendation tasks, but they often fall short in utilizing product image style information and shopping cart data effectively. In light of this, we propose Style4Rec, a transformer-based e-commerce recommendation system that harnesses style and shopping cart information to enhance existing transformer-based sequential product recommendation systems. We tested our model using an e-commerce dataset from our partnering company and found that it exceeded established transformer-based sequential recommendation benchmarks across various evaluation metrics, with notable improvements: HR@5 increased from 0.681 to 0.735, NDCG@5 from 0.594 to 0.674, and MRR@5 from 0.559 to 0.654. Style4Rec thus represents a significant step forward in personalized e-commerce recommendation systems.
Submitted 16 January, 2025; originally announced January 2025.
Comments: 9 pages, 6 images, 4 tables
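For readers unfamiliar with the reported metrics, the sketch below gives the standard textbook definitions of HR@K, NDCG@K, and MRR@K for a single held-out item per user; it is a reference implementation, not the authors' evaluation code.

```python
import math

def hit_rate_at_k(rank: int, k: int) -> float:
    # rank is the 1-based position of the ground-truth item in the ranked list
    return 1.0 if rank <= k else 0.0

def ndcg_at_k(rank: int, k: int) -> float:
    # one relevant item: DCG = 1 / log2(rank + 1), and the ideal DCG is 1
    return 1.0 / math.log2(rank + 1) if rank <= k else 0.0

def mrr_at_k(rank: int, k: int) -> float:
    return 1.0 / rank if rank <= k else 0.0

# Averaged over users; e.g., ranks of the held-out item for three users:
ranks = [1, 4, 7]
print(sum(ndcg_at_k(r, 5) for r in ranks) / len(ranks))
```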
arXiv:2501.09285 [pdf, ps, other] https://arxiv.org/abs/2501.09285
Subjects: cs.LO (Logic in Computer Science)
Title: Graded Concurrent PDL
Authors: Chun-Yu Lin
Abstract: Propositional Dynamic Logic (PDL) is a modal logic designed to formalize reasoning about programs. By extending accessibility between states to a relation between states and state sets, concurrent propositional dynamic logic (CPDL), due to Peleg and Goldblatt, was introduced to cover concurrent programs. We study a many-valued generalization of CPDL in which satisfiability and the reachability relation between states and state sets are graded over a finite Łukasiewicz chain. Finitely-valued dynamic logic has been shown to be useful in formalizing reasoning about program behavior under uncertainty. We obtain completeness results for all finitely-valued PDL.
Submitted 15 January, 2025; originally announced January 2025.
Comments: For conference submission
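As a worked example of the truth-value structure the grading lives over (the CPDL semantics in the paper is built on top of this and is not reproduced here), the finite Łukasiewicz chain carries the standard t-norm max(0, x + y - 1) and residuum min(1, 1 - x + y):

```python
# The n-element Łukasiewicz chain {0, 1/(n-1), ..., 1} with its standard
# strong conjunction (t-norm) and implication (residuum).
from fractions import Fraction

def luk_chain(n: int) -> list[Fraction]:
    return [Fraction(i, n - 1) for i in range(n)]

def t_norm(x: Fraction, y: Fraction) -> Fraction:
    return max(Fraction(0), x + y - 1)

def implies(x: Fraction, y: Fraction) -> Fraction:
    return min(Fraction(1), 1 - x + y)

chain = luk_chain(4)  # the 4-valued chain: 0, 1/3, 2/3, 1
for x in chain:
    print([str(implies(x, y)) for y in chain])  # implication truth table
```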
arXiv:2501.07279 [pdf, other] https://arxiv.org/abs/2501.07279
Subjects: cs.IT (Information Theory)
Title: Toward Universal Decoding of Binary Linear Block Codes via Enhanced Polar Transformations
Authors: Chien-Ying Lin, Yu-Chih Huang, Shin-Lin Shieh, Po-Ning Chen
Abstract: Binary linear block codes (BLBCs) are essential to modern communication, but their diverse structures often require multiple decoders, increasing complexity. This work introduces enhanced polar decoding ($\mathsf{PD}^+$), a universal soft decoding algorithm that transforms any BLBC into a polar-like code compatible with efficient polar code decoders such as successive cancellation list (SCL) decoding. Key innovations in $\mathsf{PD}^+$ include pruning polar kernels, shortening codes, and leveraging a simulated annealing algorithm to optimize transformations. These enable $\mathsf{PD}^+$ to achieve performance competitive with or superior to state-of-the-art algorithms like OSD and GRAND across various codes, including extended BCH, extended Golay, and binary quadratic residue codes, with significantly lower complexity. Moreover, $\mathsf{PD}^+$ is designed to be forward-compatible with advancements in polar code decoding techniques and AI-driven search methods, making it a robust and versatile solution for universal BLBC decoding in both present and future systems.
Submitted 13 January, 2025; originally announced January 2025.
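For background, the polar-like structure such decoders assume is the transform $x = u \cdot F^{\otimes n}$ over GF(2) with Arikan's kernel $F = \begin{pmatrix}1 & 0\\ 1 & 1\end{pmatrix}$. The sketch below shows only this baseline transform; the $\mathsf{PD}^+$ steps themselves (kernel pruning, shortening, simulated annealing) are not reproduced here.

```python
import numpy as np

F = np.array([[1, 0], [1, 1]], dtype=np.uint8)  # Arikan's 2x2 polar kernel

def polar_transform_matrix(n: int) -> np.ndarray:
    """Kronecker power F^(x)n, a 2^n x 2^n generator matrix over GF(2)."""
    G = np.array([[1]], dtype=np.uint8)
    for _ in range(n):
        G = np.kron(G, F)
    return G

u = np.array([0, 1, 1, 0, 1, 0, 0, 1], dtype=np.uint8)  # length-2^3 input word
x = (u @ polar_transform_matrix(3)) % 2                  # polar-transformed codeword
print(x)
```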
arXiv:2501.05464 [pdf, other] https://arxiv.org/abs/2501.05464
Subjects: cs.CL (Computation and Language); cs.AI (Artificial Intelligence); cs.IR (Information Retrieval)
Title: LLM-MedQA: Enhancing Medical Question Answering through Case Studies in Large Language Models
Authors: Hang Yang, Hao Chen, Hui Guo, Yineng Chen, Ching-Sheng Lin, Shu Hu, Jinrong Hu, Xi Wu, Xin Wang
Abstract: Accurate and efficient question-answering systems are essential for delivering high-quality patient care in the medical field. While Large Language Models (LLMs) have made remarkable strides across various domains, they continue to face significant challenges in medical question answering, particularly in understanding domain-specific terminologies and performing complex reasoning. These limitations undermine their effectiveness in critical medical applications. To address these issues, we propose a novel approach incorporating similar case generation within a multi-agent medical question-answering (MedQA) system. Specifically, we leverage the Llama3.1:70B model, a state-of-the-art LLM, in a multi-agent architecture to enhance performance on the MedQA dataset using zero-shot learning. Our method capitalizes on the model's inherent medical knowledge and reasoning capabilities, eliminating the need for additional training data. Experimental results show substantial performance gains over existing benchmark models, with improvements of 7% in both accuracy and F1-score across various medical QA tasks. Furthermore, we examine the model's interpretability and reliability in addressing complex medical queries. This research not only offers a robust solution for medical question answering but also establishes a foundation for broader applications of LLMs in the medical domain.
Submitted 18 January, 2025; v1 submitted 31 December, 2024; originally announced January 2025.
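A hedged sketch of the general pattern, similar-case generation feeding a multi-agent answerer, is given below. The agent roles, prompts, and the `ask_llm` helper are hypothetical stand-ins; the paper's actual architecture may differ.

```python
def ask_llm(prompt: str) -> str:
    # Stub: replace with a real client call (e.g., to a hosted Llama3.1:70B).
    return "B"

def medqa_answer(question: str, options: list[str]) -> str:
    # Agent 1: generate a similar solved case to prime domain reasoning.
    case = ask_llm(f"Write a short clinical case similar to: {question}")
    # Agent 2: answer the original question, conditioned on the generated case.
    draft = ask_llm(f"Case study:\n{case}\n\nQuestion: {question}\n"
                    f"Options: {options}\nAnswer with one option letter.")
    # Agent 3: verify the draft and return the final choice.
    return ask_llm(f"Check this answer for {question!r}: {draft}. "
                   f"Reply with the single best option from {options}.")

print(medqa_answer("Which drug treats condition X?", ["A", "B", "C", "D"]))
```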
arXiv:2501.05224 [pdf, other] https://arxiv.org/abs/2501.05224
Subjects: cs.CL (Computation and Language)
Title: Leveraging Large Language Models for Zero-shot Lay Summarisation in Biomedicine and Beyond
Authors: Tomas Goldsack, Carolina Scarton, Chenghua Lin
Abstract: In this work, we explore the application of Large Language Models to zero-shot Lay Summarisation. We propose a novel two-stage framework for Lay Summarisation based on real-life processes, and find that summaries generated with this method are increasingly preferred by human judges for larger models. To help establish best practices for employing LLMs in zero-shot settings, we also assess the ability of LLMs as judges, finding that they are able to replicate the preferences of human judges. Finally, we take the initial steps towards Lay Summarisation for Natural Language Processing (NLP) articles, finding that LLMs are able to generalise to this new domain, and further highlighting the greater utility of summaries generated by our proposed approach via an in-depth human evaluation.
Submitted 9 January, 2025; originally announced January 2025.
Comments: Preprint
arXiv:2501.03847 [pdf, other] https://arxiv.org/abs/2501.03847
Subjects: cs.CV (Computer Vision and Pattern Recognition); cs.AI (Artificial Intelligence); cs.GR (Graphics)
Title: Diffusion as Shader: 3D-aware Video Diffusion for Versatile Video Generation Control
Authors: Zekai Gu, Rui Yan, Jiahao Lu, Peng Li, Zhiyang Dou, Chenyang Si, Zhen Dong, Qifeng Liu, Cheng Lin, Ziwei Liu, Wenping Wang, Yuan Liu
Abstract: Diffusion models have demonstrated impressive performance in generating high-quality videos from text prompts or images. However, precise control over the video generation process, such as camera manipulation or content editing, remains a significant challenge. Existing methods for controlled video generation are typically limited to a single control type, lacking the flexibility to handle diverse control demands. In this paper, we introduce Diffusion as Shader (DaS), a novel approach that supports multiple video control tasks within a unified architecture. Our key insight is that achieving versatile video control necessitates leveraging 3D control signals, as videos are fundamentally 2D renderings of dynamic 3D content. Unlike prior methods limited to 2D control signals, DaS leverages 3D tracking videos as control inputs, making the video diffusion process inherently 3D-aware. This innovation allows DaS to achieve a wide range of video controls by simply manipulating the 3D tracking videos. A further advantage of using 3D tracking videos is their ability to effectively link frames, significantly enhancing the temporal consistency of the generated videos. With just 3 days of fine-tuning on 8 H800 GPUs using less than 10k videos, DaS demonstrates strong control capabilities across diverse tasks, including mesh-to-video generation, camera control, motion transfer, and object manipulation.
Submitted 8 January, 2025; v1 submitted 7 January, 2025; originally announced January 2025.
Comments: Project page: https://igl-hkust.github.io/das/ Codes: https://github.com/IGL-HKUST/DiffusionAsShader
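To make the idea of a control video concrete, the toy sketch below shows one generic way an aligned control signal can be fed to a video denoiser, channel-wise concatenation of latents. This is a common conditioning pattern, not the DaS architecture; shapes and the latent layout are illustrative assumptions.

```python
import numpy as np

T, H, W = 16, 32, 32
rgb_latents = np.random.randn(T, 4, H, W).astype(np.float32)  # noisy video latents
tracking = np.random.randn(T, 4, H, W).astype(np.float32)     # 3D-tracking-video latents

# Per diffusion step, a denoiser could consume the stacked tensor so every
# frame sees its aligned 3D control signal.
conditioned = np.concatenate([rgb_latents, tracking], axis=1)  # shape (T, 8, H, W)
print(conditioned.shape)
```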
href="/search/cs?searchtype=author&amp;query=Dworakowski%2C+D">Daniel Dworakowski</a>, <a href="/search/cs?searchtype=author&amp;query=Fan%2C+J">Jiaojiao Fan</a>, <a href="/search/cs?searchtype=author&amp;query=Fenzi%2C+M">Michele Fenzi</a>, <a href="/search/cs?searchtype=author&amp;query=Ferroni%2C+F">Francesco Ferroni</a>, <a href="/search/cs?searchtype=author&amp;query=Fidler%2C+S">Sanja Fidler</a>, <a href="/search/cs?searchtype=author&amp;query=Fox%2C+D">Dieter Fox</a>, <a href="/search/cs?searchtype=author&amp;query=Ge%2C+S">Songwei Ge</a>, <a href="/search/cs?searchtype=author&amp;query=Ge%2C+Y">Yunhao Ge</a>, <a href="/search/cs?searchtype=author&amp;query=Gu%2C+J">Jinwei Gu</a>, <a href="/search/cs?searchtype=author&amp;query=Gururani%2C+S">Siddharth Gururani</a>, <a href="/search/cs?searchtype=author&amp;query=He%2C+E">Ethan He</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+J">Jiahui Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Huffman%2C+J">Jacob Huffman</a> , et al. (54 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.03575v1-abstract-short" style="display: inline;"> Physical AI needs to be trained digitally first. It needs a digital twin of itself, the policy model, and a digital twin of the world, the world model. In this paper, we present the Cosmos World Foundation Model Platform to help developers build customized world models for their Physical AI setups. We position a world foundation model as a general-purpose world model that can be fine-tuned into cu&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.03575v1-abstract-full').style.display = 'inline'; document.getElementById('2501.03575v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.03575v1-abstract-full" style="display: none;"> Physical AI needs to be trained digitally first. It needs a digital twin of itself, the policy model, and a digital twin of the world, the world model. In this paper, we present the Cosmos World Foundation Model Platform to help developers build customized world models for their Physical AI setups. We position a world foundation model as a general-purpose world model that can be fine-tuned into customized world models for downstream applications. Our platform covers a video curation pipeline, pre-trained world foundation models, examples of post-training of pre-trained world foundation models, and video tokenizers. To help Physical AI builders solve the most critical problems of our society, we make our platform open-source and our models open-weight with permissive licenses available via https://github.com/NVIDIA/Cosmos. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.03575v1-abstract-full').style.display = 'none'; document.getElementById('2501.03575v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. 
arXiv:2501.03360 [pdf, other] https://arxiv.org/abs/2501.03360
Subjects: quant-ph (Quantum Physics); cs.CV (Computer Vision and Pattern Recognition); eess.IV (Image and Video Processing)
DOI: 10.1109/TGRS.2024.3517459 (https://doi.org/10.1109/TGRS.2024.3517459)
Title: Quantum Feature-Empowered Deep Classification for Fast Mangrove Mapping
Authors: Chia-Hsiang Lin, Po-Wei Tang, Alfredo R. Huete
Abstract: A mangrove mapping (MM) algorithm is an essential classification tool for environmental monitoring. The recent literature shows that, compared with other index-based MM methods that treat pixels as spatially independent, convolutional neural networks (CNNs) are crucial for leveraging spatial continuity information, leading to improved classification performance. In this work, we go a step further and show that quantum features provide radically new information for the CNN to further upgrade the classification results. Simply speaking, a CNN computes affine-mapping features, while a quantum neural network (QNN) offers unitary-computing features, thereby offering a fresh perspective on the final decision-making (classification). To address the challenging MM problem, we design an entangled spatial-spectral quantum feature extraction module. Notably, to ensure that the quantum features contribute genuinely novel information (unaffected by traditional CNN features), we design a separate network track consisting solely of quantum neurons with built-in interpretability. The extracted pure quantum information is then fused with traditional feature information to jointly make the final decision. The proposed quantum-empowered deep network (QEDNet) is very lightweight, so the improvement comes from the cooperation between the CNN and QNN rather than from parameter augmentation. Extensive experiments are conducted to demonstrate the superiority of QEDNet.
Submitted 6 January, 2025; originally announced January 2025.
Comments: This work has been accepted by IEEE Transactions on Geoscience and Remote Sensing (TGRS)
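The affine-versus-unitary contrast the abstract draws can be shown in a few lines. This toy sketch is illustrative only (it is not the QEDNet module): an affine feature is W x + b, while a unitary feature applies U = exp(-iH) for Hermitian H, which preserves the norm of an amplitude-encoded input.

```python
import numpy as np
from scipy.linalg import expm

x = np.array([0.8, 0.6])  # amplitude-encoded input with unit norm

# Affine feature, as computed by a classical (CNN-style) neuron.
W, b = np.array([[0.5, -0.2], [0.1, 0.3]]), np.array([0.05, -0.05])
affine_feat = W @ x + b

# Unitary feature, quantum-style: exp(-iH) is unitary when H is Hermitian.
H = np.array([[0.0, 0.7], [0.7, 0.0]])
U = expm(-1j * H)
unitary_feat = U @ x

print(affine_feat, np.linalg.norm(unitary_feat))  # the unitary map preserves the norm
```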
arXiv:2501.02654 [pdf, other] https://arxiv.org/abs/2501.02654
Subjects: cs.CL (Computation and Language); cs.AI (Artificial Intelligence)
Title: Tougher Text, Smarter Models: Raising the Bar for Adversarial Defence Benchmarks
Authors: Yang Wang, Chenghua Lin
Abstract: Recent advancements in natural language processing have highlighted the vulnerability of deep learning models to adversarial attacks. While various defence mechanisms have been proposed, there is a lack of comprehensive benchmarks that evaluate these defences across diverse datasets, models, and tasks. In this work, we address this gap by presenting an extensive benchmark for textual adversarial defence that significantly expands upon previous work. Our benchmark incorporates a wide range of datasets, evaluates state-of-the-art defence mechanisms, and extends the assessment to include critical tasks such as single-sentence classification, similarity and paraphrase identification, natural language inference, and commonsense reasoning. This work not only serves as a valuable resource for researchers and practitioners in the field of adversarial robustness but also identifies key areas for future research in textual adversarial defence. By establishing a new standard for benchmarking in this domain, we aim to accelerate progress towards more robust and reliable natural language processing systems.
Submitted 8 January, 2025; v1 submitted 5 January, 2025; originally announced January 2025.
Comments: Will be presented as an oral in-person presentation at COLING 2025
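The core protocol such benchmarks report is accuracy under attack. Below is a minimal sketch of that loop; the `model` and `attack` here are trivial stand-ins, not the paper's defences or attack suites.

```python
def accuracy_under_attack(model, attack, dataset) -> float:
    correct = 0
    for text, label in dataset:
        adversarial = attack(text)                 # perturb within a semantic budget
        correct += int(model(adversarial) == label)
    return correct / len(dataset)

model = lambda text: "positive" if "good" in text else "negative"  # toy classifier
attack = lambda text: text.replace("good", "decent")               # toy synonym swap

data = [("a good film", "positive"), ("a dull film", "negative")]
print(accuracy_under_attack(model, attack, data))  # 0.5: the swap flips one prediction
```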
arXiv:2501.02373 [pdf, other] https://arxiv.org/abs/2501.02373
Subjects: cs.LG (Machine Learning); cs.CR (Cryptography and Security)
Title: BADTV: Unveiling Backdoor Threats in Third-Party Task Vectors
Authors: Chia-Yi Hsu, Yu-Lin Tsai, Yu Zhe, Yan-Lun Chen, Chih-Hsun Lin, Chia-Mu Yu, Yang Zhang, Chun-Ying Huang, Jun Sakuma
Abstract: Task arithmetic in large-scale pre-trained models enables flexible adaptation to diverse downstream tasks without extensive re-training. By leveraging task vectors (TVs), users can perform modular updates to pre-trained models through simple arithmetic operations like addition and subtraction. However, this flexibility introduces new security vulnerabilities. In this paper, we identify and evaluate the susceptibility of TVs to backdoor attacks, demonstrating how malicious actors can exploit TVs to compromise model integrity. By developing composite backdoors and eliminating redundant clean tasks, we introduce BadTV, a novel backdoor attack specifically designed to remain effective under task learning, forgetting, and analogy operations. Our extensive experiments reveal that BadTV achieves near-perfect attack success rates across various scenarios, significantly impacting the security of models using task arithmetic. We also explore existing defenses, showing that current methods fail to detect or mitigate BadTV. Our findings highlight the need for robust defense mechanisms to secure TVs in real-world applications, especially as TV services become more popular in machine-learning ecosystems.
Submitted 4 January, 2025; originally announced January 2025.
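Task arithmetic itself is simple enough to show in miniature: a task vector is the parameter delta between a fine-tuned and a pre-trained model, and models are edited by adding (learning) or subtracting (forgetting) such deltas. The sketch below shows that mechanism, which is what a third-party, possibly backdoored TV plugs into; it is not the BadTV attack, and the weights are made-up toy values.

```python
import numpy as np

pretrained = {"w": np.array([0.1, 0.2, 0.3])}
finetuned  = {"w": np.array([0.4, 0.1, 0.5])}

# Task vector: fine-tuned weights minus pre-trained weights, per parameter.
task_vector = {k: finetuned[k] - pretrained[k] for k in pretrained}

# Learning: add the (possibly third-party) task vector to the base model.
adapted = {k: pretrained[k] + task_vector[k] for k in pretrained}
# Forgetting: subtract (negate) it instead.
removed = {k: pretrained[k] - task_vector[k] for k in pretrained}
print(adapted["w"], removed["w"])
```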
