Search | arXiv e-print repository
<a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1–50 of 884 results for author: <span class="mathjax">Hu, S</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a> </span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&query=Hu%2C+S">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." 
type="text" value="Hu, S"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Hu%2C+S&terms-0-field=author&size=50&order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Hu, S"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. 
</div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Hu%2C+S&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Hu%2C+S&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Hu%2C+S&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Hu%2C+S&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Hu%2C+S&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Hu%2C+S&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">…</span></li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.10119">arXiv:2502.10119</a> <span> [<a href="https://arxiv.org/pdf/2502.10119">pdf</a>, <a href="https://arxiv.org/format/2502.10119">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> SeWA: Selective Weight Average via Probabilistic Masking </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+P">Peng Wang</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+S">Shengchao Hu</a>, <a href="/search/cs?searchtype=author&query=Tao%2C+Z">Zerui Tao</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+G">Guoxia Wang</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+D">Dianhai Yu</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+L">Li Shen</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+Q">Quan Zheng</a>, <a href="/search/cs?searchtype=author&query=Tao%2C+D">Dacheng Tao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.10119v1-abstract-short" style="display: inline;"> Weight averaging has become a standard technique for enhancing model performance. However, methods such as Stochastic Weight Averaging (SWA) and Latest Weight Averaging (LAWA) often require manually designed procedures to sample from the training trajectory, and the results depend heavily on hyperparameter tuning. 
To minimize human effort, this paper proposes a simple yet efficient algorithm calle… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.10119v1-abstract-full').style.display = 'inline'; document.getElementById('2502.10119v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.10119v1-abstract-full" style="display: none;"> Weight averaging has become a standard technique for enhancing model performance. However, methods such as Stochastic Weight Averaging (SWA) and Latest Weight Averaging (LAWA) often require manually designed procedures to sample from the training trajectory, and the results depend heavily on hyperparameter tuning. To minimize human effort, this paper proposes a simple yet efficient algorithm called Selective Weight Averaging (SeWA), which adaptively selects checkpoints during the final stages of training for averaging. Based on SeWA, we show that only a few points are needed to achieve better generalization and faster convergence. Theoretically, solving the discrete subset selection problem is inherently challenging. To address this, we transform it into a continuous probabilistic optimization framework and employ the Gumbel-Softmax estimator to learn the non-differentiable mask for each checkpoint. Further, we theoretically derive the SeWA's stability-based generalization bounds, which are sharper than that of SGD under both convex and non-convex assumptions. Finally, solid extended experiments in various domains, including behavior cloning, image classification, and text classification, further validate the effectiveness of our approach. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.10119v1-abstract-full').style.display = 'none'; document.getElementById('2502.10119v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. 
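
The SeWA abstract describes relaxing the discrete checkpoint-selection problem with a Gumbel-Softmax-style estimator. The following is a minimal sketch of that general idea, not the authors' code: a relaxed binary mask over saved checkpoints is learned by gradient descent, and the selected weights are averaged. The names `checkpoints`, `val_loss`, and all hyperparameters are illustrative assumptions; `val_loss` is assumed to evaluate an averaged state dict differentiably (e.g., via torch.func.functional_call).

```python
# Hedged sketch (not the paper's implementation): learn a relaxed 0/1 mask over
# checkpoints with a Gumbel-Sigmoid estimator, then average the selected weights.
import torch

def gumbel_sigmoid(logits, temperature=0.5):
    """Relaxed Bernoulli sample in (0, 1) for each checkpoint logit."""
    u = torch.rand_like(logits).clamp(1e-6, 1 - 1e-6)
    logistic_noise = torch.log(u) - torch.log(1 - u)
    return torch.sigmoid((logits + logistic_noise) / temperature)

def averaged_weights(checkpoints, mask):
    """Weight each checkpoint state dict by the normalized soft mask and sum."""
    weights = mask / mask.sum()
    return {
        name: sum(w * ckpt[name] for w, ckpt in zip(weights, checkpoints))
        for name in checkpoints[0]
    }

def learn_selection(checkpoints, val_loss, steps=200, lr=0.05):
    """Optimize per-checkpoint logits so the averaged model minimizes `val_loss`.

    `val_loss` is an assumed callable mapping a state dict to a differentiable
    scalar loss (e.g., built with torch.func.functional_call on a validation batch).
    """
    logits = torch.zeros(len(checkpoints), requires_grad=True)
    opt = torch.optim.Adam([logits], lr=lr)
    for _ in range(steps):
        mask = gumbel_sigmoid(logits)  # differentiable surrogate of discrete picks
        loss = val_loss(averaged_weights(checkpoints, mask))
        opt.zero_grad()
        loss.backward()
        opt.step()
    # Checkpoints whose selection probability ended up above 0.5.
    return (torch.sigmoid(logits) > 0.5).nonzero().flatten()
```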

2. arXiv:2502.09291  [pdf, ps, other]  eess.SP, cs.LG
Joint Attention Mechanism Learning to Facilitate Opto-physiological Monitoring during Physical Activity
Authors: Xiaoyu Zheng, Sijung Hu, Vincent Dwyer, Mahsa Derakhshani, Laura Barrett
Abstract: Opto-physiological monitoring is a non-contact technique for measuring cardiac signals, i.e., photoplethysmography (PPG). Quality PPG signals directly lead to reliable physiological readings. However, PPG signal acquisition procedures are often accompanied by spurious motion artefacts (MAs), especially during low-to-high-intensity physical activity. This study proposes a practical adversarial learning approach for opto-physiological monitoring that uses a generative adversarial network with an attention mechanism (AM-GAN) to model motion noise and allow MA removal. The AM-GAN learns an MA-resistant mapping from raw, noisy signals to clear PPG signals in an adversarial manner, guided by an attention mechanism that directly translates the motion reference of triaxial acceleration to the MAs appearing in the raw signal. The AM-GAN was evaluated under three different protocols involving 39 subjects performing various physical activities. The average absolute error for heart rate (HR) derived from the MA-free PPG signal via the AM-GAN is 1.81 beats/min for the IEEE-SPC dataset and 3.86 beats/min for the PPGDalia dataset. The same procedure applied to an in-house LU dataset resulted in average absolute errors for HR and respiratory rate (RR) of less than 1.37 beats/min and 2.49 breaths/min, respectively. The study demonstrates the robustness and resilience of AM-GAN, particularly during low-to-high-intensity physical activities.
Submitted 13 February, 2025; originally announced February 2025.

3. arXiv:2502.08788  [pdf, other]  cs.CL, cs.LG
If Multi-Agent Debate is the Answer, What is the Question?
Authors: Hangfan Zhang, Zhiyao Cui, Xinrun Wang, Qiaosheng Zhang, Zhen Wang, Dinghao Wu, Shuyue Hu
Abstract: Multi-agent debate (MAD) has emerged as a promising approach to enhance the factual accuracy and reasoning quality of large language models (LLMs) by engaging multiple agents in iterative discussions during inference. Despite its potential, we argue that current MAD research suffers from critical shortcomings in evaluation practices, including limited dataset overlap and inconsistent baselines, raising significant concerns about generalizability. Correspondingly, this paper presents a systematic evaluation of five representative MAD methods across nine benchmarks using four foundation models. Surprisingly, our findings reveal that MAD methods fail to reliably outperform simple single-agent baselines such as Chain-of-Thought and Self-Consistency, even when consuming additional inference-time computation. Our analysis shows that model heterogeneity can significantly improve MAD frameworks: we propose Heter-MAD, which enables a single LLM agent to access the output of heterogeneous foundation models and boosts the performance of current MAD frameworks. Finally, we outline potential directions for advancing MAD, aiming to spark a broader conversation and inspire future work in this area.
Submitted 12 February, 2025; originally announced February 2025.
Comments: This position paper takes a critical view of the status quo of MAD research and outlines multiple potential directions to improve MAD.
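
For readers unfamiliar with the multi-agent debate setup evaluated in the entry above, the following is a hedged sketch of one debate loop, with the Heter-MAD-style variation of drawing answers from a list of heterogeneous backends. It is not the paper's implementation; `query_model` and the model names are assumed placeholders for any text-in/text-out LLM call, and the prompts are illustrative.

```python
# Hedged sketch of multi-agent debate: each agent revises its answer after reading
# the other agents' previous answers; heterogeneity comes from using different models.
from typing import Callable, List

def debate_round(question: str,
                 prev_answers: List[str],
                 models: List[str],
                 query_model: Callable[[str, str], str]) -> List[str]:
    """One round: agent i sees every answer except its own and replies again."""
    new_answers = []
    for i, model in enumerate(models):
        peer_answers = [a for j, a in enumerate(prev_answers) if j != i]
        prompt = (
            f"Question: {question}\n"
            "Other agents answered:\n"
            + "\n".join(f"- {a}" for a in peer_answers)
            + "\nConsidering these answers, give your own final answer."
        )
        new_answers.append(query_model(model, prompt))
    return new_answers

def debate(question: str, models: List[str],
           query_model: Callable[[str, str], str], rounds: int = 3) -> List[str]:
    """Initial answers followed by several debate rounds; aggregate (e.g., by vote) afterwards."""
    answers = [query_model(m, f"Question: {question}\nGive your answer.") for m in models]
    for _ in range(rounds):
        answers = debate_round(question, answers, models, query_model)
    return answers
```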

4. arXiv:2502.07807  [pdf, other]  cs.CR, cs.AI, cs.CV, cs.LG
CP-Guard+: A New Paradigm for Malicious Agent Detection and Defense in Collaborative Perception
Authors: Senkang Hu, Yihang Tao, Zihan Fang, Guowen Xu, Yiqin Deng, Sam Kwong, Yuguang Fang
Abstract: Collaborative perception (CP) is a promising method for safe connected and autonomous driving, which enables multiple vehicles to share sensing information to enhance perception performance. However, compared with single-vehicle perception, the openness of a CP system makes it more vulnerable to attacks that inject malicious information to mislead the perception of an ego vehicle, resulting in severe risks for safe driving. To mitigate such vulnerability, we first propose a new paradigm for malicious agent detection that effectively identifies malicious agents at the feature level without requiring verification of final perception results, significantly reducing computational overhead. Building on this paradigm, we introduce CP-GuardBench, the first comprehensive dataset for training and evaluating malicious agent detection methods for CP systems. Furthermore, we develop a robust defense method called CP-Guard+, which enlarges the margin between the representations of benign and malicious features through a carefully designed Dual-Centered Contrastive Loss (DCCLoss). Finally, we conduct extensive experiments on both CP-GuardBench and V2X-Sim and demonstrate the superiority of CP-Guard+.
Submitted 7 February, 2025; originally announced February 2025.

5. arXiv:2502.07730  [pdf, other]  cs.RO
DOGlove: Dexterous Manipulation with a Low-Cost Open-Source Haptic Force Feedback Glove
Authors: Han Zhang, Songbo Hu, Zhecheng Yuan, Huazhe Xu
Abstract: Dexterous hand teleoperation plays a pivotal role in enabling robots to achieve human-level manipulation dexterity. However, current teleoperation systems often rely on expensive equipment and lack multi-modal sensory feedback, restricting human operators' ability to perceive object properties and perform complex manipulation tasks. To address these limitations, we present DOGlove, a low-cost, precise haptic force feedback glove system for teleoperation and manipulation. DOGlove can be assembled in hours at a cost under 600 USD. It features a customized joint structure for 21-DoF motion capture, a compact cable-driven torque transmission mechanism for 5-DoF multidirectional force feedback, and a linear resonant actuator for 5-DoF fingertip haptic feedback. Leveraging action and haptic force retargeting, DOGlove enables precise and immersive teleoperation of dexterous robotic hands, achieving high success rates in complex, contact-rich tasks. We further evaluate DOGlove in scenarios without visual feedback, demonstrating the critical role of haptic force feedback in task performance. In addition, we utilize the collected demonstrations to train imitation learning policies, highlighting the potential and effectiveness of DOGlove. DOGlove's hardware and software system will be fully open-sourced at https://do-glove.github.io/.
Submitted 11 February, 2025; originally announced February 2025.

6. arXiv:2502.07577  [pdf, other]  cs.LG, cs.AI, cs.CL
Automated Capability Discovery via Model Self-Exploration
Authors: Cong Lu, Shengran Hu, Jeff Clune
Abstract: Foundation models have become general-purpose assistants, exhibiting diverse capabilities across numerous domains through training on web-scale data. It remains challenging to precisely characterize even a fraction of the full spectrum of capabilities and potential risks in any new model. Existing evaluation approaches often require significant human effort, and it takes increasing effort to design ever harder challenges for more capable models. We introduce Automated Capability Discovery (ACD), a framework that designates one foundation model as a scientist to systematically propose open-ended tasks probing the abilities of a subject model (potentially itself). By combining frontier models with ideas from the field of open-endedness, ACD automatically and systematically uncovers both surprising capabilities and failures in the subject model. We demonstrate ACD across a range of foundation models (including the GPT, Claude, and Llama series), showing that it automatically reveals thousands of capabilities that would be challenging for any single team to uncover. We further validate our method's automated scoring with extensive human surveys, observing high agreement between model-generated and human evaluations. By leveraging foundation models' ability to both create tasks and self-evaluate, ACD is a significant step toward scalable, automated evaluation of novel AI systems. All code and evaluation logs are open-sourced at https://github.com/conglu1997/ACD.
Submitted 12 February, 2025; v1 submitted 11 February, 2025; originally announced February 2025.
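
The scientist/subject loop described in the ACD abstract can be illustrated with a short sketch. This is not the released implementation (see the linked repository for that); `query_model` is an assumed text-in/text-out helper, and the prompts and scoring format are illustrative.

```python
# Hedged sketch of the scientist/subject pattern: one model proposes open-ended tasks,
# the subject model attempts them, and the scientist scores each attempt.
from typing import Callable, Dict, List

def automated_capability_discovery(scientist: str, subject: str,
                                   query_model: Callable[[str, str], str],
                                   n_tasks: int = 50) -> List[Dict[str, str]]:
    discovered: List[Dict[str, str]] = []
    for _ in range(n_tasks):
        # Scientist proposes a new task, steered away from what has already been found.
        task = query_model(
            scientist,
            "Propose one new task that probes an ability or failure mode of another model. "
            "Avoid tasks similar to:\n" + "\n".join(d["task"] for d in discovered))
        # Subject model attempts the task.
        attempt = query_model(subject, task)
        # Scientist scores the attempt (automated evaluation).
        verdict = query_model(
            scientist,
            f"Task: {task}\nAttempt: {attempt}\n"
            "Did the attempt succeed? Answer PASS or FAIL and explain briefly.")
        discovered.append({"task": task, "attempt": attempt, "verdict": verdict})
    return discovered
```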
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.07577v2-abstract-full').style.display = 'none'; document.getElementById('2502.07577v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 11 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.07373">arXiv:2502.07373</a> <span> [<a href="https://arxiv.org/pdf/2502.07373">pdf</a>, <a href="https://arxiv.org/format/2502.07373">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multiagent Systems">cs.MA</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Neural and Evolutionary Computing">cs.NE</span> </div> </div> <p class="title is-5 mathjax"> EvoFlow: Evolving Diverse Agentic Workflows On The Fly </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+G">Guibin Zhang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+K">Kaijie Chen</a>, <a href="/search/cs?searchtype=author&query=Wan%2C+G">Guancheng Wan</a>, <a href="/search/cs?searchtype=author&query=Chang%2C+H">Heng Chang</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+H">Hong Cheng</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+K">Kun Wang</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+S">Shuyue Hu</a>, <a href="/search/cs?searchtype=author&query=Bai%2C+L">Lei Bai</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.07373v1-abstract-short" style="display: inline;"> The past two years have witnessed the evolution of large language model (LLM)-based multi-agent systems from labor-intensive manual design to partial automation (\textit{e.g.}, prompt engineering, communication topology) and eventually to fully automated design. However, existing agentic automation pipelines often lack LLM heterogeneity and focus on single-objective performance optimization, limit… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.07373v1-abstract-full').style.display = 'inline'; document.getElementById('2502.07373v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.07373v1-abstract-full" style="display: none;"> The past two years have witnessed the evolution of large language model (LLM)-based multi-agent systems from labor-intensive manual design to partial automation (\textit{e.g.}, prompt engineering, communication topology) and eventually to fully automated design. However, existing agentic automation pipelines often lack LLM heterogeneity and focus on single-objective performance optimization, limiting their potential to combine weaker models for more customized and cost-effective solutions. 
To address this challenge, we propose EvoFlow, a niching evolutionary algorithm-based framework to automatically search a population of heterogeneous and complexity-adaptive agentic workflows, rather than a single homogeneous, complex workflow. Technically, EvoFlow performs \textit{(1) tag-based retrieval} to extract parent workflows from an agentic population, evolves new workflows through \textit{(2) crossover} and \textit{(3) mutation}, and employs \textit{(4) niching-based selection} to maintain population diversity and quality. Extensive evaluations across seven benchmarks demonstrate that EvoFlow is: \textbf{(I) diverse}, evolving a population of workflows ranging from simple I/O tasks to complex multi-turn interactions; \textbf{(II) high-performing}, outperforming previous handcrafted and automated workflows by $1.23\%\sim29.86\%$; \textbf{(III) economical}, surpassing powerful \llmname{o1-preview} at $12.4\%$ of its inference cost using weaker open-source models. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.07373v1-abstract-full').style.display = 'none'; document.getElementById('2502.07373v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.06838">arXiv:2502.06838</a> <span> [<a href="https://arxiv.org/pdf/2502.06838">pdf</a>, <a href="https://arxiv.org/format/2502.06838">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> TorchResist: Open-Source Differentiable Resist Simulator </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zixiao Wang</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+J">Jieya Zhou</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+S">Su Zheng</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+S">Shuo Yin</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+K">Kaichao Liang</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+S">Shoubo Hu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+X">Xiao Chen</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+B">Bei Yu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.06838v1-abstract-short" style="display: inline;"> Recent decades have witnessed remarkable advancements in artificial intelligence (AI), including large language models (LLMs), image and video generative models, and embodied AI systems. These advancements have led to an explosive increase in the demand for computational power, challenging the limits of Moore's Law. 
Optical lithography, a critical technology in semiconductor manufacturing, faces s… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.06838v1-abstract-full').style.display = 'inline'; document.getElementById('2502.06838v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.06838v1-abstract-full" style="display: none;"> Recent decades have witnessed remarkable advancements in artificial intelligence (AI), including large language models (LLMs), image and video generative models, and embodied AI systems. These advancements have led to an explosive increase in the demand for computational power, challenging the limits of Moore's Law. Optical lithography, a critical technology in semiconductor manufacturing, faces significant challenges due to its high costs. To address this, various lithography simulators have been developed. However, many of these simulators are limited by their inadequate photoresist modeling capabilities. This paper presents TorchResist, an open-source, differentiable photoresist simulator.TorchResist employs an analytical approach to model the photoresist process, functioning as a white-box system with at most twenty interpretable parameters. Leveraging modern differentiable programming techniques and parallel computing on GPUs, TorchResist enables seamless co-optimization with other tools across multiple related tasks. Our experimental results demonstrate that TorchResist achieves superior accuracy and efficiency compared to existing solutions. The source code is publicly available. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.06838v1-abstract-full').style.display = 'none'; document.getElementById('2502.06838v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">SPIE Advanced Lithography + Patterning, 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.06817">arXiv:2502.06817</a> <span> [<a href="https://arxiv.org/pdf/2502.06817">pdf</a>, <a href="https://arxiv.org/format/2502.06817">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Diffusion-empowered AutoPrompt MedSAM </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Huang%2C+P">Peng Huang</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+S">Shu Hu</a>, <a href="/search/cs?searchtype=author&query=Peng%2C+B">Bo Peng</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jiashu Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+H">Hongtu Zhu</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+X">Xi Wu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xin Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.06817v1-abstract-short" style="display: inline;"> MedSAM, a medical foundation model derived from the SAM architecture, has demonstrated notable success across diverse medical domains. However, its clinical application faces two major challenges: the dependency on labor-intensive manual prompt generation, which imposes a significant burden on clinicians, and the absence of semantic labeling in the generated segmentation masks for organs or lesion… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.06817v1-abstract-full').style.display = 'inline'; document.getElementById('2502.06817v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.06817v1-abstract-full" style="display: none;"> MedSAM, a medical foundation model derived from the SAM architecture, has demonstrated notable success across diverse medical domains. However, its clinical application faces two major challenges: the dependency on labor-intensive manual prompt generation, which imposes a significant burden on clinicians, and the absence of semantic labeling in the generated segmentation masks for organs or lesions, limiting its practicality for non-expert users. To address these limitations, we propose AutoMedSAM, an end-to-end framework derived from SAM, designed to enhance usability and segmentation performance. AutoMedSAM retains MedSAM's image encoder and mask decoder structure while introducing a novel diffusion-based class prompt encoder. The diffusion-based encoder employs a dual-decoder structure to collaboratively generate prompt embeddings guided by sparse and dense prompt definitions. These embeddings enhance the model's ability to understand and process clinical imagery autonomously. 
With this encoder, AutoMedSAM leverages class prompts to embed semantic information into the model's predictions, transforming MedSAM's semi-automated pipeline into a fully automated workflow. Furthermore, AutoMedSAM employs an uncertainty-aware joint optimization strategy during training to effectively inherit MedSAM's pre-trained knowledge while improving generalization by integrating multiple loss functions. Experimental results across diverse datasets demonstrate that AutoMedSAM achieves superior performance while broadening its applicability to both clinical settings and non-expert users. Code is available at https://github.com/HP-ML/AutoPromptMedSAM.git. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.06817v1-abstract-full').style.display = 'none'; document.getElementById('2502.06817v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.06498">arXiv:2502.06498</a> <span> [<a href="https://arxiv.org/pdf/2502.06498">pdf</a>, <a href="https://arxiv.org/format/2502.06498">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Decision Boundary Optimization-Informed Domain Adaptation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Luo%2C+L">Lingkun Luo</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+S">Shiqiang Hu</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+J">Jie Yang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+L">Liming Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.06498v1-abstract-short" style="display: inline;"> Maximum Mean Discrepancy (MMD) is widely used in a number of domain adaptation (DA) methods and shows its effectiveness in aligning data distributions across domains. However, in previous DA research, MMD-based DA methods focus mostly on distribution alignment, and ignore to optimize the decision boundary for classification-aware DA, thereby falling short in reducing the DA upper error bound. In t… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.06498v1-abstract-full').style.display = 'inline'; document.getElementById('2502.06498v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.06498v1-abstract-full" style="display: none;"> Maximum Mean Discrepancy (MMD) is widely used in a number of domain adaptation (DA) methods and shows its effectiveness in aligning data distributions across domains. However, in previous DA research, MMD-based DA methods focus mostly on distribution alignment, and ignore to optimize the decision boundary for classification-aware DA, thereby falling short in reducing the DA upper error bound. 
In this paper, we propose a strengthened MMD measurement, namely, Decision Boundary optimization-informed MMD (DB-MMD), which enables MMD to carefully take into account the decision boundaries, thereby simultaneously optimizing the distribution alignment and cross-domain classifier within a hybrid framework, and leading to a theoretical bound guided DA. We further seamlessly embed the proposed DB-MMD measurement into several popular DA methods, e.g., MEDA, DGA-DA, to demonstrate its effectiveness w.r.t different experimental settings. We carry out comprehensive experiments using 8 standard DA datasets. The experimental results show that the DB-MMD enforced DA methods improve their baseline models using plain vanilla MMD, with a margin that can be as high as 9.5. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.06498v1-abstract-full').style.display = 'none'; document.getElementById('2502.06498v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.06272">arXiv:2502.06272</a> <span> [<a href="https://arxiv.org/pdf/2502.06272">pdf</a>, <a href="https://arxiv.org/format/2502.06272">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Beyond Batch Learning: Global Awareness Enhanced Domain Adaptation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Luo%2C+L">Lingkun Luo</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+S">Shiqiang Hu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+L">Liming Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.06272v1-abstract-short" style="display: inline;"> In domain adaptation (DA), the effectiveness of deep learning-based models is often constrained by batch learning strategies that fail to fully apprehend the global statistical and geometric characteristics of data distributions. Addressing this gap, we introduce 'Global Awareness Enhanced Domain Adaptation' (GAN-DA), a novel approach that transcends traditional batch-based limitations. GAN-DA int… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.06272v1-abstract-full').style.display = 'inline'; document.getElementById('2502.06272v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.06272v1-abstract-full" style="display: none;"> In domain adaptation (DA), the effectiveness of deep learning-based models is often constrained by batch learning strategies that fail to fully apprehend the global statistical and geometric characteristics of data distributions. Addressing this gap, we introduce 'Global Awareness Enhanced Domain Adaptation' (GAN-DA), a novel approach that transcends traditional batch-based limitations. 
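
DB-MMD builds on the standard Maximum Mean Discrepancy. As background, the following sketch computes the usual biased RBF-kernel estimate of squared MMD between a source batch and a target batch; the decision-boundary terms that DB-MMD adds are specific to the paper and are not reproduced here. The bandwidth `gamma` and the synthetic data in the example are illustrative.

```python
# Hedged sketch: biased RBF-kernel estimate of MMD^2 between source Xs and target Xt.
# MMD^2 = mean k(s, s') + mean k(t, t') - 2 mean k(s, t).
import numpy as np

def mmd2_rbf(Xs: np.ndarray, Xt: np.ndarray, gamma: float = 1.0) -> float:
    def rbf(A, B):
        # Pairwise squared Euclidean distances, then the Gaussian kernel.
        sq = np.sum(A**2, 1)[:, None] + np.sum(B**2, 1)[None, :] - 2 * A @ B.T
        return np.exp(-gamma * sq)
    return rbf(Xs, Xs).mean() + rbf(Xt, Xt).mean() - 2 * rbf(Xs, Xt).mean()

# Example: two Gaussians with shifted means give a clearly positive MMD^2.
rng = np.random.default_rng(0)
Xs = rng.normal(0.0, 1.0, size=(200, 5))
Xt = rng.normal(0.5, 1.0, size=(200, 5))
print(mmd2_rbf(Xs, Xt))
```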

11. arXiv:2502.06272  [pdf, other]  cs.LG
Beyond Batch Learning: Global Awareness Enhanced Domain Adaptation
Authors: Lingkun Luo, Shiqiang Hu, Liming Chen
Abstract: In domain adaptation (DA), the effectiveness of deep learning-based models is often constrained by batch learning strategies that fail to fully apprehend the global statistical and geometric characteristics of data distributions. Addressing this gap, we introduce Global Awareness Enhanced Domain Adaptation (GAN-DA), a novel approach that transcends traditional batch-based limitations. GAN-DA integrates a unique predefined feature representation (PFR) to facilitate the alignment of cross-domain distributions, thereby achieving comprehensive global statistical awareness. This representation is expanded to encompass orthogonal and common feature aspects, which enhances the unification of global manifold structures and refines decision boundaries for more effective DA. Our extensive experiments, encompassing 27 diverse cross-domain image classification tasks, demonstrate GAN-DA's remarkable superiority, outperforming 24 established DA methods by a significant margin. Furthermore, our in-depth analyses shed light on the decision-making processes, revealing insights into the adaptability and efficiency of GAN-DA. This approach not only addresses the limitations of existing DA methodologies but also sets a new benchmark in the realm of domain adaptation, offering broad implications for future research and applications in this field.
Submitted 10 February, 2025; originally announced February 2025.
Journal ref: IEEE Transactions on Pattern Analysis and Machine Intelligence, 2025.

12. arXiv:2502.06134  [pdf, other]  cs.CV, cs.AI
Integrating Sequence and Image Modeling in Irregular Medical Time Series Through Self-Supervised Learning
Authors: Liuqing Chen, Shuhong Xiao, Shixian Ding, Shanhai Hu, Lingyun Sun
Abstract: Medical time series are often irregular and suffer from significant missingness, posing challenges for data analysis and clinical decision-making. Existing methods typically adopt a single modeling perspective, either treating series data as sequences or transforming them into image representations for further classification. In this paper, we propose a joint learning framework that incorporates both sequence and image representations. We also design three self-supervised learning strategies to facilitate the fusion of sequence and image representations, capturing a more generalizable joint representation. The results indicate that our approach outperforms seven other state-of-the-art models on three representative real-world clinical datasets. We further validate our approach by simulating two major types of real-world missingness through leave-sensors-out and leave-samples-out techniques. The results demonstrate that our approach is more robust and significantly surpasses other baselines in terms of classification performance.
Submitted 9 February, 2025; originally announced February 2025.
Comments: 9 pages, 2 figures, AAAI 2025.

13. arXiv:2502.05787  [pdf, other]  cs.ET
TAP-CAM: A Tunable Approximate Matching Engine based on Ferroelectric Content Addressable Memory
Authors: Chenyu Ni, Sijie Chen, Che-Kai Liu, Liu Liu, Mohsen Imani, Thomas Kampfe, Kai Ni, Michael Niemier, Xiaobo Sharon Hu, Cheng Zhuo, Xunzhao Yin
Abstract: Pattern search is crucial in numerous analytic applications for retrieving data entries akin to the query. Content Addressable Memories (CAMs), an in-memory computing fabric, directly compare input queries with stored entries through embedded comparison logic, facilitating fast parallel pattern search in memory. While conventional CAM designs offer exact match functionality, they are inadequate for meeting the approximate search needs of emerging data-intensive applications. Some recent CAM designs propose approximate matching functions, but they face limitations such as excessively large cell area or the inability to precisely control the degree of approximation. In this paper, we propose TAP-CAM, a novel ferroelectric field effect transistor (FeFET) based ternary CAM (TCAM) capable of both exact and tunable approximate matching.
TAP-CAM employs a compact 2FeFET-2R cell structure as the entry storage unit, and similarities in Hamming distances between input queries and stored entries are measured using an evaluation transistor associated with the matchline of CAM array. The operation, robustness and performance of the proposed design at array level have been discussed and evaluated, respectively. We conduct a case study of K-nearest neighbor (KNN) search to benchmark the proposed TAP-CAM at application level. Results demonstrate that compared to 16T CMOS CAM with exact match functionality, TAP-CAM achieves a 16.95x energy improvement, along with a 3.06% accuracy enhancement. Compared to 2FeFET TCAM with approximate match functionality, TAP-CAM achieves a 6.78x energy improvement. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.05787v1-abstract-full').style.display = 'none'; document.getElementById('2502.05787v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.04492">arXiv:2502.04492</a> <span> [<a href="https://arxiv.org/pdf/2502.04492">pdf</a>, <a href="https://arxiv.org/format/2502.04492">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Multi-Agent Reinforcement Learning with Focal Diversity Optimization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Tekin%2C+S+F">Selim Furkan Tekin</a>, <a href="/search/cs?searchtype=author&query=Ilhan%2C+F">Fatih Ilhan</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+T">Tiansheng Huang</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+S">Sihao Hu</a>, <a href="/search/cs?searchtype=author&query=Yahn%2C+Z">Zachary Yahn</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+L">Ling Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.04492v1-abstract-short" style="display: inline;"> The advancement of Large Language Models (LLMs) and their finetuning strategies has triggered the renewed interests in multi-agent reinforcement learning. In this paper, we introduce a focal diversity-optimized multi-agent reinforcement learning approach, coined as MARL-Focal, with three unique characteristics. First, we develop an agent-fusion framework for encouraging multiple LLM based agents t… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.04492v1-abstract-full').style.display = 'inline'; document.getElementById('2502.04492v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.04492v1-abstract-full" style="display: none;"> The advancement of Large Language Models (LLMs) and their finetuning strategies has triggered the renewed interests in multi-agent reinforcement learning. 
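Illustrative aside: the TAP-CAM entry above (arXiv:2502.05787) matches queries against stored entries by Hamming distance under a tunable threshold. A minimal software sketch of that matching primitive follows; the ternary "don't care" encoding and the threshold value are assumptions for illustration, and the FeFET hardware realization described in the paper is entirely different.

```python
# Software sketch of the search primitive a ternary CAM accelerates: compare a
# binary query against stored ternary entries (0, 1, or None for "don't care")
# and return the entries whose Hamming distance is within a tunable threshold.
from typing import List, Optional

Entry = List[Optional[int]]  # e.g. [1, 0, None, 1]; None matches anything

def hamming_distance(query: List[int], entry: Entry) -> int:
    return sum(1 for q, e in zip(query, entry) if e is not None and q != e)

def approximate_match(query: List[int], entries: List[Entry], threshold: int):
    # Indices of stored entries within `threshold` mismatches of the query.
    return [i for i, e in enumerate(entries)
            if hamming_distance(query, e) <= threshold]

stored = [[1, 0, 1, 1], [1, None, 0, 0], [0, 0, 1, 1]]
print(approximate_match([1, 0, 1, 0], stored, threshold=1))  # -> [0, 1]
```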
In this paper, we introduce a focal diversity-optimized multi-agent reinforcement learning approach, coined as MARL-Focal, with three unique characteristics. First, we develop an agent-fusion framework for encouraging multiple LLM based agents to collaborate in producing the final inference output for each LLM query. Second, we develop a focal-diversity optimized agent selection algorithm that can choose a small subset of the available agents based on how well they can complement one another to generate the query output. Finally, we design a conflict-resolution method to detect output inconsistency among multiple agents and produce our MARL-Focal output through reward-aware and policy-adaptive inference fusion. Extensive evaluations on five benchmarks show that MARL-Focal is cost-efficient and adversarial-robust. Our multi-agent fusion model achieves performance improvement of 5.51\% compared to the best individual LLM-agent and offers stronger robustness over the TruthfulQA benchmark. Code is available at https://github.com/sftekin/rl-focal <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.04492v1-abstract-full').style.display = 'none'; document.getElementById('2502.04492v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.03375">arXiv:2502.03375</a> <span> [<a href="https://arxiv.org/pdf/2502.03375">pdf</a>, <a href="https://arxiv.org/format/2502.03375">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1145/3696410.3714697">10.1145/3696410.3714697 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Interactive Visualization Recommendation with Hier-SUCB </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Hu%2C+S">Songwen Hu</a>, <a href="/search/cs?searchtype=author&query=Rossi%2C+R+A">Ryan A. Rossi</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+T">Tong Yu</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+J">Junda Wu</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+H">Handong Zhao</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+S">Sungchul Kim</a>, <a href="/search/cs?searchtype=author&query=Li%2C+S">Shuai Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.03375v3-abstract-short" style="display: inline;"> Visualization recommendation aims to enable rapid visual analysis of massive datasets. In real-world scenarios, it is essential to quickly gather and comprehend user preferences to cover users from diverse backgrounds, including varying skill levels and analytical tasks. 
Previous approaches to personalized visualization recommendations are non-interactive and rely on initial user data for new user… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.03375v3-abstract-full').style.display = 'inline'; document.getElementById('2502.03375v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.03375v3-abstract-full" style="display: none;"> Visualization recommendation aims to enable rapid visual analysis of massive datasets. In real-world scenarios, it is essential to quickly gather and comprehend user preferences to cover users from diverse backgrounds, including varying skill levels and analytical tasks. Previous approaches to personalized visualization recommendations are non-interactive and rely on initial user data for new users. As a result, these models cannot effectively explore options or adapt to real-time feedback. To address this limitation, we propose an interactive personalized visualization recommendation (PVisRec) system that learns on user feedback from previous interactions. For more interactive and accurate recommendations, we propose Hier-SUCB, a contextual combinatorial semi-bandit in the PVisRec setting. Theoretically, we show an improved overall regret bound with the same rank of time but an improved rank of action space. We further demonstrate the effectiveness of Hier-SUCB through extensive experiments where it is comparable to offline methods and outperforms other bandit algorithms in the setting of visualization recommendation. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.03375v3-abstract-full').style.display = 'none'; document.getElementById('2502.03375v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 5 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. 
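Illustrative aside: Hier-SUCB above (arXiv:2502.03375) is a contextual combinatorial semi-bandit. The sketch below shows only the elementary UCB explore/exploit rule that bandit-based recommenders build on, with made-up Bernoulli arms; it is not the Hier-SUCB algorithm, which additionally handles context and a hierarchical, combinatorial action space.

```python
# Minimal UCB1 sketch of the explore/exploit trade-off underlying bandit-based
# recommendation. Arm payoffs and the exploration constant are illustrative.
import math
import random

def ucb1(true_means, horizon=5000, c=2.0, seed=0):
    rng = random.Random(seed)
    n_arms = len(true_means)
    counts, sums = [0] * n_arms, [0.0] * n_arms
    for t in range(1, horizon + 1):
        if t <= n_arms:                      # play each arm once first
            arm = t - 1
        else:
            arm = max(range(n_arms),
                      key=lambda a: sums[a] / counts[a]
                      + math.sqrt(c * math.log(t) / counts[a]))
        reward = 1.0 if rng.random() < true_means[arm] else 0.0  # Bernoulli feedback
        counts[arm] += 1
        sums[arm] += reward
    return counts

print(ucb1([0.2, 0.5, 0.7]))   # most pulls should concentrate on the 0.7 arm
```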
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.18724">arXiv:2501.18724</a> <span> [<a href="https://arxiv.org/pdf/2501.18724">pdf</a>, <a href="https://arxiv.org/format/2501.18724">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Zero-shot Large Language Models for Long Clinical Text Summarization with Temporal Reasoning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Kruse%2C+M">Maya Kruse</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+S">Shiyue Hu</a>, <a href="/search/cs?searchtype=author&query=Derby%2C+N">Nicholas Derby</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+Y">Yifu Wu</a>, <a href="/search/cs?searchtype=author&query=Stonbraker%2C+S">Samantha Stonbraker</a>, <a href="/search/cs?searchtype=author&query=Yao%2C+B">Bingsheng Yao</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+D">Dakuo Wang</a>, <a href="/search/cs?searchtype=author&query=Goldberg%2C+E">Elizabeth Goldberg</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+Y">Yanjun Gao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.18724v1-abstract-short" style="display: inline;"> Recent advancements in large language models (LLMs) have shown potential for transforming data processing in healthcare, particularly in understanding complex clinical narratives. This study evaluates the efficacy of zero-shot LLMs in summarizing long clinical texts that require temporal reasoning, a critical aspect for comprehensively capturing patient histories and treatment trajectories. We app… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.18724v1-abstract-full').style.display = 'inline'; document.getElementById('2501.18724v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.18724v1-abstract-full" style="display: none;"> Recent advancements in large language models (LLMs) have shown potential for transforming data processing in healthcare, particularly in understanding complex clinical narratives. This study evaluates the efficacy of zero-shot LLMs in summarizing long clinical texts that require temporal reasoning, a critical aspect for comprehensively capturing patient histories and treatment trajectories. We applied a series of advanced zero-shot LLMs to extensive clinical documents, assessing their ability to integrate and accurately reflect temporal dynamics without prior task-specific training. While the models efficiently identified key temporal events, they struggled with chronological coherence over prolonged narratives. The evaluation, combining quantitative and qualitative methods, highlights the strengths and limitations of zero-shot LLMs in clinical text summarization. The results suggest that while promising, zero-shot LLMs require further refinement to effectively support clinical decision-making processes, underscoring the need for enhanced model training approaches that better capture the nuances of temporal information in long context medical documents. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.18724v1-abstract-full').style.display = 'none'; document.getElementById('2501.18724v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.17433">arXiv:2501.17433</a> <span> [<a href="https://arxiv.org/pdf/2501.17433">pdf</a>, <a href="https://arxiv.org/format/2501.17433">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Virus: Harmful Fine-tuning Attack for Large Language Models Bypassing Guardrail Moderation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Huang%2C+T">Tiansheng Huang</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+S">Sihao Hu</a>, <a href="/search/cs?searchtype=author&query=Ilhan%2C+F">Fatih Ilhan</a>, <a href="/search/cs?searchtype=author&query=Tekin%2C+S+F">Selim Furkan Tekin</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+L">Ling Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.17433v1-abstract-short" style="display: inline;"> Recent research shows that Large Language Models (LLMs) are vulnerable to harmful fine-tuning attacks -- models lose their safety alignment ability after fine-tuning on a few harmful samples. For risk mitigation, a guardrail is typically used to filter out harmful samples before fine-tuning. By designing a new red-teaming method, we in this paper show that purely relying on the moderation guardrai… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.17433v1-abstract-full').style.display = 'inline'; document.getElementById('2501.17433v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.17433v1-abstract-full" style="display: none;"> Recent research shows that Large Language Models (LLMs) are vulnerable to harmful fine-tuning attacks -- models lose their safety alignment ability after fine-tuning on a few harmful samples. For risk mitigation, a guardrail is typically used to filter out harmful samples before fine-tuning. By designing a new red-teaming method, we in this paper show that purely relying on the moderation guardrail for data filtration is not reliable. Our proposed attack method, dubbed Virus, easily bypasses the guardrail moderation by slightly modifying the harmful data. Experimental results show that the harmful data optimized by Virus is not detectable by the guardrail with up to 100\% leakage ratio, and can simultaneously achieve superior attack performance. 
Finally, the key message we want to convey through this paper is that: \textbf{it is reckless to consider guardrail moderation as a clutch at straws towards harmful fine-tuning attack}, as it cannot solve the inherent safety issue of the pre-trained LLMs. Our code is available at https://github.com/git-disl/Virus <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.17433v1-abstract-full').style.display = 'none'; document.getElementById('2501.17433v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.17183">arXiv:2501.17183</a> <span> [<a href="https://arxiv.org/pdf/2501.17183">pdf</a>, <a href="https://arxiv.org/format/2501.17183">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> LLM Evaluation Based on Aerospace Manufacturing Expertise: Automated Generation and Multi-Model Question Answering </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+B">Beiming Liu</a>, <a href="/search/cs?searchtype=author&query=Cui%2C+Z">Zhizhuo Cui</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+S">Siteng Hu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+X">Xiaohua Li</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+H">Haifeng Lin</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Z">Zhengxin Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.17183v2-abstract-short" style="display: inline;"> Aerospace manufacturing demands exceptionally high precision in technical parameters. The remarkable performance of Large Language Models (LLMs), such as GPT-4 and QWen, in Natural Language Processing has sparked industry interest in their application to tasks including process design, material selection, and tool information retrieval. However, LLMs are prone to generating "hallucinations" in spe… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.17183v2-abstract-full').style.display = 'inline'; document.getElementById('2501.17183v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.17183v2-abstract-full" style="display: none;"> Aerospace manufacturing demands exceptionally high precision in technical parameters. The remarkable performance of Large Language Models (LLMs), such as GPT-4 and QWen, in Natural Language Processing has sparked industry interest in their application to tasks including process design, material selection, and tool information retrieval. However, LLMs are prone to generating "hallucinations" in specialized domains, producing inaccurate or false information that poses significant risks to the quality of aerospace products and flight safety. 
This paper introduces a set of evaluation metrics tailored for LLMs in aerospace manufacturing, aiming to assess their accuracy by analyzing their performance in answering questions grounded in professional knowledge. Firstly, key information is extracted through in-depth textual analysis of classic aerospace manufacturing textbooks and guidelines. Subsequently, utilizing LLM generation techniques, we meticulously construct multiple-choice questions with multiple correct answers of varying difficulty. Following this, different LLM models are employed to answer these questions, and their accuracy is recorded. Experimental results demonstrate that the capabilities of LLMs in aerospace professional knowledge are in urgent need of improvement. This study provides a theoretical foundation and practical guidance for the application of LLMs in aerospace manufacturing, addressing a critical gap in the field. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.17183v2-abstract-full').style.display = 'none'; document.getElementById('2501.17183v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 25 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">conference paper</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">MSC Class:</span> 68T50; 90B30 <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> I.2.7; J.2 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.16309">arXiv:2501.16309</a> <span> [<a href="https://arxiv.org/pdf/2501.16309">pdf</a>, <a href="https://arxiv.org/format/2501.16309">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Medical Physics">physics.med-ph</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Evaluating The Performance of Using Large Language Models to Automate Summarization of CT Simulation Orders in Radiation Oncology </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Cao%2C+M">Meiyun Cao</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+S">Shaw Hu</a>, <a href="/search/cs?searchtype=author&query=Sharp%2C+J">Jason Sharp</a>, <a href="/search/cs?searchtype=author&query=Clouser%2C+E">Edward Clouser</a>, <a href="/search/cs?searchtype=author&query=Holmes%2C+J">Jason Holmes</a>, <a href="/search/cs?searchtype=author&query=Lam%2C+L+L">Linda L. Lam</a>, <a href="/search/cs?searchtype=author&query=Ding%2C+X">Xiaoning Ding</a>, <a href="/search/cs?searchtype=author&query=Toesca%2C+D+S">Diego Santos Toesca</a>, <a href="/search/cs?searchtype=author&query=Lindholm%2C+W+S">Wendy S. Lindholm</a>, <a href="/search/cs?searchtype=author&query=Patel%2C+S+H">Samir H. Patel</a>, <a href="/search/cs?searchtype=author&query=Vora%2C+S+A">Sujay A. 
Vora</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+P">Peilong Wang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+W">Wei Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.16309v1-abstract-short" style="display: inline;"> Purpose: This study aims to use a large language model (LLM) to automate the generation of summaries from the CT simulation orders and evaluate its performance. Materials and Methods: A total of 607 CT simulation orders for patients were collected from the Aria database at our institution. A locally hosted Llama 3.1 405B model, accessed via the Application Programming Interface (API) service, wa… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.16309v1-abstract-full').style.display = 'inline'; document.getElementById('2501.16309v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.16309v1-abstract-full" style="display: none;"> Purpose: This study aims to use a large language model (LLM) to automate the generation of summaries from the CT simulation orders and evaluate its performance. Materials and Methods: A total of 607 CT simulation orders for patients were collected from the Aria database at our institution. A locally hosted Llama 3.1 405B model, accessed via the Application Programming Interface (API) service, was used to extract keywords from the CT simulation orders and generate summaries. The downloaded CT simulation orders were categorized into seven groups based on treatment modalities and disease sites. For each group, a customized instruction prompt was developed collaboratively with therapists to guide the Llama 3.1 405B model in generating summaries. The ground truth for the corresponding summaries was manually derived by carefully reviewing each CT simulation order and subsequently verified by therapists. The accuracy of the LLM-generated summaries was evaluated by therapists using the verified ground truth as a reference. Results: About 98% of the LLM-generated summaries aligned with the manually generated ground truth in terms of accuracy. Our evaluations showed an improved consistency in format and enhanced readability of the LLM-generated summaries compared to the corresponding therapists-generated summaries. This automated approach demonstrated a consistent performance across all groups, regardless of modality or disease site. Conclusions: This study demonstrated the high precision and consistency of the Llama 3.1 405B model in extracting keywords and summarizing CT simulation orders, suggesting that LLMs have great potential to help with this task, reduce the workload of therapists and improve workflow efficiency. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.16309v1-abstract-full').style.display = 'none'; document.getElementById('2501.16309v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. 
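Illustrative aside: the radiation-oncology entry above (arXiv:2501.16309) pairs group-specific instruction prompts with therapist-verified reference summaries. A minimal sketch of that evaluation loop is below; the LLM call is left as a stub because the hosted-model endpoint and prompts are institution-specific, and the exact-match scoring criterion is an assumption (the study relied on therapist review).

```python
# Sketch of the evaluation loop: generate a summary per CT simulation order with
# a group-specific instruction prompt, then score it against a verified reference.
from typing import Dict, List

def summarize_order(order_text: str, instruction: str) -> str:
    # Placeholder for the real call to a locally hosted model (e.g. via an HTTP API).
    raise NotImplementedError

def accuracy(orders: List[Dict[str, str]], prompts: Dict[str, str]) -> float:
    # Each order dict is assumed to hold 'group', 'text', and a verified 'reference'.
    correct = 0
    for order in orders:
        summary = summarize_order(order["text"], prompts[order["group"]])
        # Simplistic criterion; the study scored accuracy by therapist review instead.
        correct += int(summary.strip().lower() == order["reference"].strip().lower())
    return correct / len(orders)
```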
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.12984">arXiv:2501.12984</a> <span> [<a href="https://arxiv.org/pdf/2501.12984">pdf</a>, <a href="https://arxiv.org/ps/2501.12984">ps</a>, <a href="https://arxiv.org/format/2501.12984">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> </div> </div> <p class="title is-5 mathjax"> Lower Bounds on the Sub-Packetization of Optimal-Access MSR Codes for Multiple-Node Repair </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+L">Lewen Wang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Z">Zihao Zhang</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+S">Sihuang Hu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2501.12984v1-abstract-full" style="display: inline;"> We establish lower bounds on the sub-packetization of optimal-access MSR codes in the context of multiple-node failures. These bounds generalize the tight bounds for single-node failure presented by Balaji et al. (IEEE Transactions on Information Theory, vol. 68, no. 10, 2022). Moreover, we utilize generating functions to provide a more refined analysis, further strengthening these bounds. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025.
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.12766">arXiv:2501.12766</a> <span> [<a href="https://arxiv.org/pdf/2501.12766">pdf</a>, <a href="https://arxiv.org/format/2501.12766">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> NExtLong: Toward Effective Long-Context Training without Long Documents </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Gao%2C+C">Chaochen Gao</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+X">Xing Wu</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+Z">Zijia Lin</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+D">Debing Zhang</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+S">Songlin Hu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.12766v1-abstract-short" style="display: inline;"> Large language models (LLMs) with extended context windows have made significant strides yet remain a challenge due to the scarcity of long documents. Existing methods tend to synthesize long-context data but lack a clear mechanism to reinforce the long-range dependency modeling. To address this limitation, we propose NExtLong, a novel framework for synthesizing long-context data through Negative… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.12766v1-abstract-full').style.display = 'inline'; document.getElementById('2501.12766v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.12766v1-abstract-full" style="display: none;"> Large language models (LLMs) with extended context windows have made significant strides yet remain a challenge due to the scarcity of long documents. Existing methods tend to synthesize long-context data but lack a clear mechanism to reinforce the long-range dependency modeling. To address this limitation, we propose NExtLong, a novel framework for synthesizing long-context data through Negative document Extension. NExtLong decomposes a document into multiple meta-chunks and extends the context by interleaving hard negative distractors retrieved from pretraining corpora. This approach compels the model to discriminate long-range dependent context from distracting content, enhancing its ability to model long-range dependencies. Extensive experiments demonstrate that NExtLong achieves significant performance improvements on the HELMET and RULER benchmarks compared to existing long-context synthesis approaches and leading models, which are trained on non-synthetic long documents. These findings highlight NExtLong's ability to reduce reliance on non-synthetic long documents, making it an effective framework for developing advanced long-context LLMs. 
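Illustrative aside: NExtLong above (arXiv:2501.12766) extends a document by interleaving its meta-chunks with hard negative distractors. A toy sketch of that interleaving step follows; the character-level chunker and the caller-supplied retrieval function are assumptions for illustration, not the paper's pipeline.

```python
# Toy sketch of negative document extension: split a document into meta-chunks
# and interleave each chunk with distractor chunks retrieved from a corpus, so a
# model must track long-range dependencies across distracting content.
from typing import Callable, List

def chunk(document: str, chunk_size: int = 512) -> List[str]:
    return [document[i:i + chunk_size] for i in range(0, len(document), chunk_size)]

def extend_with_negatives(document: str,
                          retrieve_hard_negatives: Callable[[str, int], List[str]],
                          negatives_per_chunk: int = 2) -> str:
    # `retrieve_hard_negatives` is a placeholder; NExtLong retrieves distractors
    # from pretraining corpora with its own similarity criteria.
    pieces = []
    for meta_chunk in chunk(document):
        pieces.append(meta_chunk)
        pieces.extend(retrieve_hard_negatives(meta_chunk, negatives_per_chunk))
    return "\n".join(pieces)
```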
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.12766v1-abstract-full').style.display = 'none'; document.getElementById('2501.12766v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Corresponding authors: xing wu, and songlin hu</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.12281">arXiv:2501.12281</a> <span> [<a href="https://arxiv.org/pdf/2501.12281">pdf</a>, <a href="https://arxiv.org/format/2501.12281">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> MoGERNN: An Inductive Traffic Predictor for Unobserved Locations in Dynamic Sensing Networks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhou%2C+Q">Qishen Zhou</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yifan Zhang</a>, <a href="/search/cs?searchtype=author&query=Makridis%2C+M+A">Michail A. Makridis</a>, <a href="/search/cs?searchtype=author&query=Kouvelas%2C+A">Anastasios Kouvelas</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yibing Wang</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+S">Simon Hu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.12281v1-abstract-short" style="display: inline;"> Given a partially observed road network, how can we predict the traffic state of unobserved locations? While deep learning approaches show exceptional performance in traffic prediction, most assume sensors at all locations of interest, which is impractical due to financial constraints. Furthermore, these methods typically require costly retraining when sensor configurations change. We propose MoGE… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.12281v1-abstract-full').style.display = 'inline'; document.getElementById('2501.12281v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.12281v1-abstract-full" style="display: none;"> Given a partially observed road network, how can we predict the traffic state of unobserved locations? While deep learning approaches show exceptional performance in traffic prediction, most assume sensors at all locations of interest, which is impractical due to financial constraints. Furthermore, these methods typically require costly retraining when sensor configurations change. We propose MoGERNN, an inductive spatio-temporal graph representation model, to address these challenges. Inspired by the Mixture of Experts approach in Large Language Models, we introduce a Mixture of Graph Expert (MoGE) block to model complex spatial dependencies through multiple graph message aggregators and a sparse gating network. 
This block estimates initial states for unobserved locations, which are then processed by a GRU-based Encoder-Decoder that integrates a graph message aggregator to capture spatio-temporal dependencies and predict future states. Experiments on two real-world datasets show MoGERNN consistently outperforms baseline methods for both observed and unobserved locations. MoGERNN can accurately predict congestion evolution even in areas without sensors, offering valuable information for traffic management. Moreover, MoGERNN is adaptable to dynamic sensing networks, maintaining competitive performance even compared to its retrained counterpart. Tests with different numbers of available sensors confirm its consistent superiority, and ablation studies validate the effectiveness of its key modules. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.12281v1-abstract-full').style.display = 'none'; document.getElementById('2501.12281v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.12104">arXiv:2501.12104</a> <span> [<a href="https://arxiv.org/pdf/2501.12104">pdf</a>, <a href="https://arxiv.org/format/2501.12104">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Teacher Encoder-Student Decoder Denoising Guided Segmentation Network for Anomaly Detection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Song%2C+S">Shixuan Song</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+H">Hao Chen</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+S">Shu Hu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xin Wang</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+J">Jinrong Hu</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+X">Xi Wu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.12104v3-abstract-short" style="display: inline;"> Visual anomaly detection is a highly challenging task, often categorized as a one-class classification and segmentation problem. Recent studies have demonstrated that the student-teacher (S-T) framework effectively addresses this challenge. However, most S-T frameworks rely solely on pre-trained teacher networks to guide student networks in learning multi-scale similar features, overlooking the po… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.12104v3-abstract-full').style.display = 'inline'; document.getElementById('2501.12104v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.12104v3-abstract-full" style="display: none;"> Visual anomaly detection is a highly challenging task, often categorized as a one-class classification and segmentation problem. 
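Illustrative aside: the MoGERNN entry above (arXiv:2501.12281) combines several graph message aggregators through a sparse gating network. A rough stand-in for that mixture-of-graph-experts step is sketched below; the two toy aggregators and the top-k gating rule are illustrative assumptions, not the paper's architecture.

```python
# Rough sketch of a mixture-of-graph-experts step: several message aggregators
# produce candidate states per node, and a sparse gate keeps only the top-k.
import numpy as np

def mean_aggregator(adj, x):        # average of neighbour features
    deg = np.maximum(adj.sum(1, keepdims=True), 1)
    return adj @ x / deg

def max_aggregator(adj, x):         # element-wise max over neighbours
    return np.where(adj[:, :, None] > 0, x[None, :, :], -np.inf).max(1)

def moge_step(adj, x, gate_weights, top_k=1):
    experts = [mean_aggregator(adj, x), max_aggregator(adj, x)]
    scores = x @ gate_weights                       # (nodes, n_experts) gating logits
    out = np.zeros_like(x)
    for node in range(x.shape[0]):
        keep = np.argsort(scores[node])[-top_k:]    # sparse: only top-k experts fire
        weights = np.exp(scores[node, keep])
        weights /= weights.sum()
        out[node] = sum(w * experts[e][node] for w, e in zip(weights, keep))
    return out

rng = np.random.default_rng(0)
adj = (rng.random((5, 5)) > 0.6).astype(float)
np.fill_diagonal(adj, 1.0)                          # self-loops: every node has a neighbour
features = rng.normal(size=(5, 8))
print(moge_step(adj, features, gate_weights=rng.normal(size=(8, 2))).shape)
```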
Recent studies have demonstrated that the student-teacher (S-T) framework effectively addresses this challenge. However, most S-T frameworks rely solely on pre-trained teacher networks to guide student networks in learning multi-scale similar features, overlooking the potential of the student networks to enhance learning through multi-scale feature fusion. In this study, we propose a novel model named PFADSeg, which integrates a pre-trained teacher network, a denoising student network with multi-scale feature fusion, and a guided anomaly segmentation network into a unified framework. By adopting a unique teacher-encoder and student-decoder denoising mode, the model improves the student network's ability to learn from teacher network features. Furthermore, an adaptive feature fusion mechanism is introduced to train a self-supervised segmentation network that synthesizes anomaly masks autonomously, significantly increasing detection performance. Evaluated on the MVTec AD dataset, PFADSeg achieves state-of-the-art results with an image-level AUC of 98.9%, a pixel-level mean precision of 76.4%, and an instance-level mean precision of 78.7%. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.12104v3-abstract-full').style.display = 'none'; document.getElementById('2501.12104v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 21 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.11884">arXiv:2501.11884</a> <span> [<a href="https://arxiv.org/pdf/2501.11884">pdf</a>, <a href="https://arxiv.org/format/2501.11884">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Fast Underwater Scene Reconstruction using Multi-View Stereo and Physical Imaging </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Hu%2C+S">Shuyi Hu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Q">Qi Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.11884v1-abstract-short" style="display: inline;"> Underwater scene reconstruction poses a substantial challenge because of the intricate interplay between light and the medium, resulting in scattering and absorption effects that make both depth estimation and rendering more complex. 
While recent Neural Radiance Fields (NeRF) based methods for underwater scenes achieve high-quality results by modeling and separating the scattering medium, they sti… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.11884v1-abstract-full').style.display = 'inline'; document.getElementById('2501.11884v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.11884v1-abstract-full" style="display: none;"> Underwater scene reconstruction poses a substantial challenge because of the intricate interplay between light and the medium, resulting in scattering and absorption effects that make both depth estimation and rendering more complex. While recent Neural Radiance Fields (NeRF) based methods for underwater scenes achieve high-quality results by modeling and separating the scattering medium, they still suffer from slow training and rendering speeds. To address these limitations, we propose a novel method that integrates Multi-View Stereo (MVS) with a physics-based underwater image formation model. Our approach consists of two branches: one for depth estimation using the traditional cost volume pipeline of MVS, and the other for rendering based on the physics-based image formation model. The depth branch improves scene geometry, while the medium branch determines the scattering parameters to achieve precise scene rendering. Unlike traditional MVSNet methods that rely on ground-truth depth, our method does not necessitate the use of depth truth, thus allowing for expedited training and rendering processes. By leveraging the medium subnet to estimate the medium parameters and combining this with a color MLP for rendering, we restore the true colors of underwater scenes and achieve higher-fidelity geometric representations. Experimental results show that our method enables high-quality synthesis of novel views in scattering media, clear views restoration by removing the medium, and outperforms existing methods in rendering quality and training efficiency. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.11884v1-abstract-full').style.display = 'none'; document.getElementById('2501.11884v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. 
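Illustrative aside: the underwater reconstruction entry above (arXiv:2501.11884) renders through a physics-based underwater image formation model. A commonly used form of such a model (attenuated direct signal plus distance-dependent backscatter) is sketched below; the parameterization and the coefficient values are textbook-style assumptions and are only assumed to resemble the paper's formulation.

```python
# Standard-style underwater image formation: observed colour = scene radiance
# attenuated with distance + a backscatter term that saturates with distance.
import numpy as np

def underwater_render(scene_rgb, depth, beta_att, beta_back, backscatter_rgb):
    """scene_rgb: (..., 3) true colours; depth: (...,) camera-to-scene distance in metres."""
    d = depth[..., None]
    direct = scene_rgb * np.exp(-beta_att * d)                       # attenuated signal
    backscatter = backscatter_rgb * (1.0 - np.exp(-beta_back * d))   # veiling light
    return direct + backscatter

scene = np.array([[0.8, 0.4, 0.2]])          # one reddish pixel
depth = np.array([5.0])                      # 5 m away
beta_att = np.array([0.40, 0.15, 0.10])      # red attenuates fastest under water
beta_back = np.array([0.30, 0.20, 0.15])
print(underwater_render(scene, depth, beta_att, beta_back,
                        backscatter_rgb=np.array([0.05, 0.25, 0.35])))
```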
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.11858">arXiv:2501.11858</a> <span> [<a href="https://arxiv.org/pdf/2501.11858">pdf</a>, <a href="https://arxiv.org/format/2501.11858">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> EmbodiedEval: Evaluate Multimodal LLMs as Embodied Agents </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Cheng%2C+Z">Zhili Cheng</a>, <a href="/search/cs?searchtype=author&query=Tu%2C+Y">Yuge Tu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+R">Ran Li</a>, <a href="/search/cs?searchtype=author&query=Dai%2C+S">Shiqi Dai</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+J">Jinyi Hu</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+S">Shengding Hu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+J">Jiahao Li</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+Y">Yang Shi</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+T">Tianyu Yu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+W">Weize Chen</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+L">Lei Shi</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+M">Maosong Sun</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.11858v1-abstract-short" style="display: inline;"> Multimodal Large Language Models (MLLMs) have shown significant advancements, providing a promising future for embodied agents. Existing benchmarks for evaluating MLLMs primarily utilize static images or videos, limiting assessments to non-interactive scenarios. Meanwhile, existing embodied AI benchmarks are task-specific and not diverse enough, which do not adequately evaluate the embodied capabi… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.11858v1-abstract-full').style.display = 'inline'; document.getElementById('2501.11858v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.11858v1-abstract-full" style="display: none;"> Multimodal Large Language Models (MLLMs) have shown significant advancements, providing a promising future for embodied agents. Existing benchmarks for evaluating MLLMs primarily utilize static images or videos, limiting assessments to non-interactive scenarios. Meanwhile, existing embodied AI benchmarks are task-specific and not diverse enough, which do not adequately evaluate the embodied capabilities of MLLMs. To address this, we propose EmbodiedEval, a comprehensive and interactive evaluation benchmark for MLLMs with embodied tasks. EmbodiedEval features 328 distinct tasks within 125 varied 3D scenes, each of which is rigorously selected and annotated. It covers a broad spectrum of existing embodied AI tasks with significantly enhanced diversity, all within a unified simulation and evaluation framework tailored for MLLMs. 
The tasks are organized into five categories: navigation, object interaction, social interaction, attribute question answering, and spatial question answering to assess different capabilities of the agents. We evaluated the state-of-the-art MLLMs on EmbodiedEval and found that they have a significant shortfall compared to human level on embodied tasks. Our analysis demonstrates the limitations of existing MLLMs in embodied capabilities, providing insights for their future development. We open-source all evaluation data and simulation framework at https://github.com/thunlp/EmbodiedEval. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.11858v1-abstract-full').style.display = 'none'; document.getElementById('2501.11858v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.10687">arXiv:2501.10687</a> <span> [<a href="https://arxiv.org/pdf/2501.10687">pdf</a>, <a href="https://arxiv.org/format/2501.10687">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> EMO2: End-Effector Guided Audio-Driven Avatar Video Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Tian%2C+L">Linrui Tian</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+S">Siqi Hu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Q">Qi Wang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+B">Bang Zhang</a>, <a href="/search/cs?searchtype=author&query=Bo%2C+L">Liefeng Bo</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.10687v1-abstract-short" style="display: inline;"> In this paper, we propose a novel audio-driven talking head method capable of simultaneously generating highly expressive facial expressions and hand gestures. Unlike existing methods that focus on generating full-body or half-body poses, we investigate the challenges of co-speech gesture generation and identify the weak correspondence between audio features and full-body gestures as a key limitat… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.10687v1-abstract-full').style.display = 'inline'; document.getElementById('2501.10687v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.10687v1-abstract-full" style="display: none;"> In this paper, we propose a novel audio-driven talking head method capable of simultaneously generating highly expressive facial expressions and hand gestures. Unlike existing methods that focus on generating full-body or half-body poses, we investigate the challenges of co-speech gesture generation and identify the weak correspondence between audio features and full-body gestures as a key limitation. To address this, we redefine the task as a two-stage process. 
In the first stage, we generate hand poses directly from audio input, leveraging the strong correlation between audio signals and hand movements. In the second stage, we employ a diffusion model to synthesize video frames, incorporating the hand poses generated in the first stage to produce realistic facial expressions and body movements. Our experimental results demonstrate that the proposed method outperforms state-of-the-art approaches, such as CyberHost and Vlogger, in terms of both visual quality and synchronization accuracy. This work provides a new perspective on audio-driven gesture generation and a robust framework for creating expressive and natural talking head animations. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.10687v1-abstract-full').style.display = 'none'; document.getElementById('2501.10687v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.07394">arXiv:2501.07394</a> <span> [<a href="https://arxiv.org/pdf/2501.07394">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> </div> </div> <p class="title is-5 mathjax"> Exploring the distribution of connectivity weights in resting-state EEG networks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Hu%2C+S">Shiang Hu</a>, <a href="/search/cs?searchtype=author&query=Gong%2C+X">Xiao Gong</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+X">Xiaolong Huang</a>, <a href="/search/cs?searchtype=author&query=Ruan%2C+J">Jie Ruan</a>, <a href="/search/cs?searchtype=author&query=Valdes-Sosa%2C+P+A">Pedro Antonio Valdes-Sosa</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.07394v2-abstract-short" style="display: inline;"> The resting-state brain networks (RSNs) reflects the functional connectivity patterns between brain modules, providing essential foundations for decoding intrinsic neural information within the brain. It serves as one of the primary tools for describing the spatial dynamics of the brain using various neuroimaging techniques, such as electroencephalography (EEG) and magnetoencephalography (MEG). Ho… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.07394v2-abstract-full').style.display = 'inline'; document.getElementById('2501.07394v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.07394v2-abstract-full" style="display: none;"> The resting-state brain networks (RSNs) reflects the functional connectivity patterns between brain modules, providing essential foundations for decoding intrinsic neural information within the brain. It serves as one of the primary tools for describing the spatial dynamics of the brain using various neuroimaging techniques, such as electroencephalography (EEG) and magnetoencephalography (MEG). 
However, the distribution rules or potential modes of functional connectivity weights in the resting state remain unclear. In this context, we first start from simulation, using a forward model to generate scalp EEG at four channel densities (19, 32, 64, 128). Subsequently, we construct scalp brain networks using five coupling measures, aiming to explore whether different channel densities or coupling measures affect the distribution pattern of functional connectivity weights. Next, we quantify the distribution pattern by calculating the skewness, kurtosis, and Shannon entropy of the functional connectivity network weights. Finally, the simulation results were validated on a normative database. We observed that: 1) the functional connection weights exhibit a right-skewed distribution and are not influenced by channel density or coupling measures; 2) the functional connection weights exhibit a relatively uniform distribution, with volume conduction potentially affecting the degree of uniformity; 3) networks constructed using coupling measures influenced by volume conduction exhibit significant correlations between the average connection weight and the skewness, kurtosis, and Shannon entropy measures. This study contributes to a deeper understanding of RSNs, provides valuable insights for research in neuroscience, and holds promise for association with brain cognition and disease diagnosis.
Submitted 18 January, 2025; v1 submitted 13 January, 2025; originally announced January 2025.
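A note on the distribution metrics used above: given a functional connectivity matrix, the skewness, kurtosis, and Shannon entropy of its weights are straightforward to compute. The following is a minimal illustrative sketch (not the authors' code); the histogram bin count and the random 19-channel matrix are arbitrary assumptions standing in for a real EEG network.

    import numpy as np
    from scipy.stats import skew, kurtosis

    def weight_distribution_metrics(conn, n_bins=32):
        """Summarize the off-diagonal weights of a symmetric connectivity matrix."""
        iu = np.triu_indices_from(conn, k=1)        # upper triangle, diagonal excluded
        w = conn[iu]
        hist, _ = np.histogram(w, bins=n_bins)      # Shannon entropy of the weight histogram
        p = hist / hist.sum()
        p = p[p > 0]
        entropy_bits = -np.sum(p * np.log2(p))
        return {"skewness": skew(w), "kurtosis": kurtosis(w), "entropy_bits": entropy_bits}

    # Toy example: a random symmetric 19-channel coupling matrix
    rng = np.random.default_rng(0)
    a = rng.random((19, 19))
    print(weight_distribution_metrics((a + a.T) / 2))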
arXiv:2501.05464 (https://arxiv.org/abs/2501.05464) [cs.CL, cs.AI, cs.IR]
LLM-MedQA: Enhancing Medical Question Answering through Case Studies in Large Language Models
Authors: Hang Yang, Hao Chen, Hui Guo, Yineng Chen, Ching-Sheng Lin, Shu Hu, Jinrong Hu, Xi Wu, Xin Wang
Abstract: Accurate and efficient question-answering systems are essential for delivering high-quality patient care in the medical field. While Large Language Models (LLMs) have made remarkable strides across various domains, they continue to face significant challenges in medical question answering, particularly in understanding domain-specific terminology and performing complex reasoning. These limitations undermine their effectiveness in critical medical applications. To address these issues, we propose a novel approach that incorporates similar case generation within a multi-agent medical question-answering (MedQA) system. Specifically, we leverage the Llama3.1:70B model, a state-of-the-art LLM, in a multi-agent architecture to enhance performance on the MedQA dataset using zero-shot learning. Our method capitalizes on the model's inherent medical knowledge and reasoning capabilities, eliminating the need for additional training data.
Experimental results show substantial performance gains over existing benchmark models, with improvements of 7% in both accuracy and F1-score across various medical QA tasks. Furthermore, we examine the model's interpretability and reliability in addressing complex medical queries. This research not only offers a robust solution for medical question answering but also establishes a foundation for broader applications of LLMs in the medical domain.
Submitted 18 January, 2025; v1 submitted 31 December, 2024; originally announced January 2025.

arXiv:2501.05004 (https://arxiv.org/abs/2501.05004) [cs.RO]
DOI: 10.1109/TAFE.2025.3528403 (https://doi.org/10.1109/TAFE.2025.3528403)
A Fast Path-Planning Method for Continuous Harvesting of Table-Top Grown Strawberries
Authors: Zhonghua Miao, Yang Chen, Lichao Yang, Shimin Hu, Ya Xiong
Abstract: Continuous harvesting and storage of multiple fruits in a single operation allow robots to significantly reduce the travel distance required for repetitive back-and-forth movements.
Traditional collision-free path planning algorithms, such as Rapidly-Exploring Random Tree (RRT) and A-star (A*), often fail to meet the demands of efficient continuous fruit harvesting due to their low search efficiency and the generation of excessive redundant points. This paper presents the Interactive Local Minima Search Algorithm (ILMSA), a fast path-planning method designed for the continuous harvesting of table-top grown strawberries. The algorithm features an interactive node expansion strategy that iteratively extends and refines collision-free path segments based on local minima points. To enable the algorithm to function in 3D, the 3D environment is projected onto multiple 2D planes and optimal paths are generated on each plane; the best path is then selected, followed by integrating and smoothing the 3D path segments. Simulations demonstrated that ILMSA outperformed existing methods, reducing path length by 21.5% and planning time by 97.1% compared to 3D-RRT, while achieving 11.6% shorter paths and 25.4% fewer nodes than the Lowest Point of the Strawberry (LPS) algorithm in 3D environments. In 2D, ILMSA achieved path lengths 16.2% shorter than A*, 23.4% shorter than RRT, and 20.9% shorter than RRT-Connect, while being over 96% faster and generating significantly fewer nodes. Field tests confirmed ILMSA's suitability for complex agricultural tasks, with a combined planning and execution time and an average path length that were approximately 58% and 69%, respectively, of those achieved by the LPS algorithm.
Submitted 9 January, 2025; originally announced January 2025.
Comments: Accepted by IEEE Transactions on AgriFood Electronics
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by IEEE Transactions on AgriFood Electronics</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.04379">arXiv:2501.04379</a> <span> [<a href="https://arxiv.org/pdf/2501.04379">pdf</a>, <a href="https://arxiv.org/format/2501.04379">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Phone-purity Guided Discrete Tokens for Dysarthric Speech Recognition </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+H">Huimeng Wang</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+X">Xurong Xie</a>, <a href="/search/cs?searchtype=author&query=Geng%2C+M">Mengzhe Geng</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+S">Shujie Hu</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+H">Haoning Xu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Youjun Chen</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zhaoqing Li</a>, <a href="/search/cs?searchtype=author&query=Deng%2C+J">Jiajun Deng</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+X">Xunying Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.04379v1-abstract-short" style="display: inline;"> Discrete tokens extracted provide efficient and domain adaptable speech features. Their application to disordered speech that exhibits articulation imprecision and large mismatch against normal voice remains unexplored. To improve their phonetic discrimination that is weakened during unsupervised K-means or vector quantization of continuous features, this paper proposes novel phone-purity guided (… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.04379v1-abstract-full').style.display = 'inline'; document.getElementById('2501.04379v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.04379v1-abstract-full" style="display: none;"> Discrete tokens extracted provide efficient and domain adaptable speech features. Their application to disordered speech that exhibits articulation imprecision and large mismatch against normal voice remains unexplored. To improve their phonetic discrimination that is weakened during unsupervised K-means or vector quantization of continuous features, this paper proposes novel phone-purity guided (PPG) discrete tokens for dysarthric speech recognition. Phonetic label supervision is used to regularize maximum likelihood and reconstruction error costs used in standard K-means and VAE-VQ based discrete token extraction. 
Experiments conducted on the UASpeech corpus suggest that the proposed PPG discrete token features extracted from HuBERT consistently outperform hybrid TDNN and end-to-end (E2E) Conformer systems using non-PPG based K-means or VAE-VQ tokens across varying codebook sizes, by statistically significant word error rate (WER) reductions of up to 0.99% and 1.77% absolute (3.21% and 4.82% relative), respectively, on the UASpeech test set of 16 dysarthric speakers. The lowest WER of 23.25% was obtained by combining systems using different token features. Consistent improvements on the phone purity metric were also achieved. T-SNE visualization further demonstrates that sharper decision boundaries were produced between K-means/VAE-VQ clusters after introducing phone-purity guidance.
Submitted 8 January, 2025; originally announced January 2025.
Comments: ICASSP 2025
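A side note on the phone purity metric mentioned above: it can be computed as the frame-weighted fraction of each discrete token cluster's majority phone label. Below is a minimal illustrative sketch (not the authors' implementation); the token and label arrays are toy placeholders.

    import numpy as np

    def phone_purity(tokens, phones):
        """Frame-level purity of discrete tokens with respect to phone labels."""
        tokens, phones = np.asarray(tokens), np.asarray(phones)
        correct = 0
        for t in np.unique(tokens):
            cluster_phones = phones[tokens == t]
            _, counts = np.unique(cluster_phones, return_counts=True)
            correct += counts.max()      # frames carrying the cluster's majority phone label
        return correct / len(tokens)

    # Toy example: 10 frames, 3 token ids, 3 phone labels
    print(phone_purity([0, 0, 1, 1, 1, 2, 2, 0, 1, 2],
                       ["ah", "ah", "s", "s", "s", "t", "t", "s", "s", "t"]))   # 0.9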
arXiv:2501.03643 (https://arxiv.org/abs/2501.03643) [cs.SD, cs.AI, eess.AS]
Effective and Efficient Mixed Precision Quantization of Speech Foundation Models
Authors: Haoning Xu, Zhaoqing Li, Zengrui Jin, Huimeng Wang, Youjun Chen, Guinan Li, Mengzhe Geng, Shujie Hu, Jiajun Deng, Xunying Liu
Abstract: This paper presents a novel mixed-precision quantization approach for speech foundation models that tightly integrates mixed-precision learning and quantized model parameter estimation into one single model compression stage. Experiments conducted on the LibriSpeech dataset with fine-tuned wav2vec2.0-base and HuBERT-large models suggest that the resulting mixed-precision quantized models increase the lossless compression ratio by factors of up to 1.7x and 1.9x over the respective uniform-precision and two-stage mixed-precision quantized baselines, which perform precision learning and model parameter quantization in separate and disjoint stages, while incurring no statistically significant word error rate (WER) increase over the 32-bit full-precision models. The system compression time of the wav2vec2.0-base and HuBERT-large models is reduced by up to 1.9 and 1.5 times over the two-stage mixed-precision baselines, while both produce lower WERs. The best-performing 3.5-bit mixed-precision quantized HuBERT-large model achieves a lossless compression ratio of 8.6x over the 32-bit full-precision system.
Submitted 11 January, 2025; v1 submitted 7 January, 2025; originally announced January 2025.
Comments: To appear at IEEE ICASSP 2025
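For context on the headline figure, a lossless compression ratio over a 32-bit baseline maps directly to an effective average bit-width. The quick check below is my own arithmetic, not taken from the paper; any gap from the nominal 3.5-bit precision would reflect the per-layer precision mix and quantization metadata.

    # Effective average bit-width implied by a lossless compression ratio
    # over a 32-bit full-precision model (rough, illustrative arithmetic).
    full_precision_bits = 32
    compression_ratio = 8.6            # reported for the quantized HuBERT-large model
    print(f"{full_precision_bits / compression_ratio:.2f} bits per parameter")   # ~3.72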
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">To appear at IEEE ICASSP 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.03182">arXiv:2501.03182</a> <span> [<a href="https://arxiv.org/pdf/2501.03182">pdf</a>, <a href="https://arxiv.org/format/2501.03182">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1145/3690624.3709212">10.1145/3690624.3709212 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Boosting Explainability through Selective Rationalization in Pre-trained Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yuan%2C+L">Libing Yuan</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+S">Shuaibo Hu</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+K">Kui Yu</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+L">Le Wu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.03182v1-abstract-short" style="display: inline;"> The widespread application of pre-trained language models (PLMs) in natural language processing (NLP) has led to increasing concerns about their explainability. Selective rationalization is a self-explanatory framework that selects human-intelligible input subsets as rationales for predictions. Recent studies have shown that applying existing rationalization frameworks to PLMs will result in sever… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.03182v1-abstract-full').style.display = 'inline'; document.getElementById('2501.03182v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.03182v1-abstract-full" style="display: none;"> The widespread application of pre-trained language models (PLMs) in natural language processing (NLP) has led to increasing concerns about their explainability. Selective rationalization is a self-explanatory framework that selects human-intelligible input subsets as rationales for predictions. Recent studies have shown that applying existing rationalization frameworks to PLMs will result in severe degeneration and failure problems, producing sub-optimal or meaningless rationales. Such failures severely damage trust in rationalization methods and constrain the application of rationalization techniques on PLMs. In this paper, we find that the homogeneity of tokens in the sentences produced by PLMs is the primary contributor to these problems. To address these challenges, we propose a method named Pre-trained Language Model's Rationalization (PLMR), which splits PLMs into a generator and a predictor to deal with NLP tasks while providing interpretable rationales. 
The generator in PLMR also alleviates homogeneity by pruning irrelevant tokens, while the predictor uses full-text information to standardize predictions. Experiments conducted on two widely used datasets across multiple PLMs demonstrate the effectiveness of the proposed method PLMR in addressing the challenge of applying selective rationalization to PLMs. Code: https://github.com/ylb777/PLMR.
Submitted 3 January, 2025; originally announced January 2025.
Comments: KDD 2025 research track
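The generator/predictor split described above follows the usual select-then-predict rationalization layout. Below is a minimal structural sketch of that layout in PyTorch (my own toy code, not PLMR): a generator scores tokens, the top-k tokens are kept as the rationale, and a predictor classifies from the masked sequence. The module names, dimensions, and hard top-k selection are illustrative assumptions; PLMR's actual pruning mechanism and training objectives differ.

    import torch
    import torch.nn as nn

    class SelectThenPredict(nn.Module):
        """Toy select-then-predict rationalizer: generator picks tokens, predictor classifies."""

        def __init__(self, vocab_size=1000, d_model=64, num_classes=2, k=5):
            super().__init__()
            self.embed = nn.Embedding(vocab_size, d_model)
            self.generator = nn.Linear(d_model, 1)       # per-token selection score
            self.predictor = nn.GRU(d_model, d_model, batch_first=True)
            self.classifier = nn.Linear(d_model, num_classes)
            self.k = k

        def forward(self, token_ids):
            x = self.embed(token_ids)                     # (batch, seq, dim)
            scores = self.generator(x).squeeze(-1)        # (batch, seq)
            # Hard top-k rationale mask (non-differentiable; real systems train the
            # generator with sampling or straight-through estimators instead).
            topk = scores.topk(self.k, dim=-1).indices
            mask = torch.zeros_like(scores).scatter(-1, topk, 1.0)
            rationale = x * mask.unsqueeze(-1)            # zero out unselected tokens
            _, h = self.predictor(rationale)
            return self.classifier(h.squeeze(0)), mask

    model = SelectThenPredict()
    logits, rationale_mask = model(torch.randint(0, 1000, (2, 20)))
    print(logits.shape, rationale_mask.sum(dim=-1))       # torch.Size([2, 2]) tensor([5., 5.])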
arXiv:2501.02450 (https://arxiv.org/abs/2501.02450) [cs.CV]
GCP: Guarded Collaborative Perception with Spatial-Temporal Aware Malicious Agent Detection
Authors: Yihang Tao, Senkang Hu, Yue Hu, Haonan An, Hangcheng Cao, Yuguang Fang
Abstract: Collaborative perception significantly enhances autonomous driving safety by extending each vehicle's perception range through message sharing among connected and autonomous vehicles. Unfortunately, it is also vulnerable to adversarial message attacks from malicious agents, resulting in severe performance degradation. While existing defenses employ hypothesis-and-verification frameworks to detect malicious agents based on single-shot outliers, they overlook temporal message correlations, which can be circumvented by subtle yet harmful perturbations in model input and output spaces. This paper reveals a novel blind area confusion (BAC) attack that compromises existing single-shot outlier-based detection methods. As a countermeasure, we propose GCP, a Guarded Collaborative Perception framework based on spatial-temporal aware malicious agent detection, which maintains single-shot spatial consistency through a confidence-scaled spatial concordance loss, while simultaneously examining temporal anomalies by reconstructing historical bird's eye view motion flows in low-confidence regions. We also employ a joint spatial-temporal Benjamini-Hochberg test to synthesize dual-domain anomaly results for reliable malicious agent detection. Extensive experiments demonstrate GCP's superior performance under diverse attack scenarios, achieving up to 34.69% improvements in AP@0.5 compared to state-of-the-art CP defense strategies under BAC attacks, while maintaining consistent 5-8% improvements under other typical attacks. Code will be released at https://github.com/CP-Security/GCP.git.
Submitted 5 January, 2025; originally announced January 2025.
Comments: 15 pages
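The Benjamini-Hochberg step mentioned above is the standard false discovery rate (FDR) control procedure over a set of p-values. A generic, minimal sketch follows (not the GCP implementation; how the spatial and temporal p-values are formed is specific to the paper, and the example p-values are arbitrary):

    import numpy as np

    def benjamini_hochberg(p_values, alpha=0.05):
        """Boolean mask of hypotheses rejected under Benjamini-Hochberg FDR control."""
        p = np.asarray(p_values, dtype=float)
        m = len(p)
        order = np.argsort(p)                            # ranks by ascending p-value
        thresholds = alpha * np.arange(1, m + 1) / m
        below = p[order] <= thresholds
        reject = np.zeros(m, dtype=bool)
        if below.any():
            k = np.nonzero(below)[0].max()               # largest rank i with p_(i) <= alpha*i/m
            reject[order[:k + 1]] = True
        return reject

    # Toy example: small p-values flag potentially anomalous agents
    print(benjamini_hochberg([0.001, 0.20, 0.03, 0.04, 0.90]))   # [ True False False False False]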
href="/search/cs?searchtype=author&query=Chen%2C+J">Jianfa Chen</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+N">Nan Jiang</a>, <a href="/search/cs?searchtype=author&query=Lyu%2C+L">Lingjuan Lyu</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+S">Shiqing Ma</a>, <a href="/search/cs?searchtype=author&query=Metaxas%2C+D+N">Dimitris N. Metaxas</a>, <a href="/search/cs?searchtype=author&query=Jain%2C+A">Ankit Jain</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.00192v1-abstract-short" style="display: inline;"> Image content safety has become a significant challenge with the rise of visual media on online platforms. Meanwhile, in the age of AI-generated content (AIGC), many image generation models are capable of producing harmful content, such as images containing sexual or violent material. Thus, it becomes crucial to identify such unsafe images based on established safety rules. Pre-trained Multimodal… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.00192v1-abstract-full').style.display = 'inline'; document.getElementById('2501.00192v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.00192v1-abstract-full" style="display: none;"> Image content safety has become a significant challenge with the rise of visual media on online platforms. Meanwhile, in the age of AI-generated content (AIGC), many image generation models are capable of producing harmful content, such as images containing sexual or violent material. Thus, it becomes crucial to identify such unsafe images based on established safety rules. Pre-trained Multimodal Large Language Models (MLLMs) offer potential in this regard, given their strong pattern recognition abilities. Existing approaches typically fine-tune MLLMs with human-labeled datasets, which however brings a series of drawbacks. First, relying on human annotators to label data following intricate and detailed guidelines is both expensive and labor-intensive. Furthermore, users of safety judgment systems may need to frequently update safety rules, making fine-tuning on human-based annotation more challenging. This raises the research question: Can we detect unsafe images by querying MLLMs in a zero-shot setting using a predefined safety constitution (a set of safety rules)? Our research showed that simply querying pre-trained MLLMs does not yield satisfactory results. This lack of effectiveness stems from factors such as the subjectivity of safety rules, the complexity of lengthy constitutions, and the inherent biases in the models. To address these challenges, we propose a MLLM-based method includes objectifying safety rules, assessing the relevance between rules and images, making quick judgments based on debiased token probabilities with logically complete yet simplified precondition chains for safety rules, and conducting more in-depth reasoning with cascaded chain-of-thought processes if necessary. Experiment results demonstrate that our method is highly effective for zero-shot image safety judgment tasks. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.00192v1-abstract-full').style.display = 'none'; document.getElementById('2501.00192v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.20796">arXiv:2412.20796</a> <span> [<a href="https://arxiv.org/pdf/2412.20796">pdf</a>, <a href="https://arxiv.org/format/2412.20796">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> FastCHGNet: Training one Universal Interatomic Potential to 1.5 Hours with 32 GPUs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhou%2C+Y">Yuanchang Zhou</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+S">Siyu Hu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+C">Chen Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+L">Lin-Wang Wang</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+G">Guangming Tan</a>, <a href="/search/cs?searchtype=author&query=Jia%2C+W">Weile Jia</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.20796v1-abstract-short" style="display: inline;"> Graph neural network universal interatomic potentials (GNN-UIPs) have demonstrated remarkable generalization and transfer capabilities in material discovery and property prediction. These models can accelerate molecular dynamics (MD) simulation by several orders of magnitude while maintaining \textit{ab initio} accuracy, making them a promising new paradigm in material simulations. One notable exa… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.20796v1-abstract-full').style.display = 'inline'; document.getElementById('2412.20796v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.20796v1-abstract-full" style="display: none;"> Graph neural network universal interatomic potentials (GNN-UIPs) have demonstrated remarkable generalization and transfer capabilities in material discovery and property prediction. These models can accelerate molecular dynamics (MD) simulation by several orders of magnitude while maintaining \textit{ab initio} accuracy, making them a promising new paradigm in material simulations. One notable example is Crystal Hamiltonian Graph Neural Network (CHGNet), pretrained on the energies, forces, stresses, and magnetic moments from the MPtrj dataset, representing a state-of-the-art GNN-UIP model for charge-informed MD simulations. 
However, training the CHGNet model is time-consuming (8.3 days on one A100 GPU) for three reasons: (i) multi-layer propagation is required to reach more distant atom information, (ii) second-order derivative calculations are required to complete weight updates, and (iii) the reference CHGNet implementation does not fully leverage the available computational capabilities. This paper introduces FastCHGNet, an optimized CHGNet, with three contributions: first, we design innovative Force/Stress Readout modules to decompose force/stress prediction; second, we adopt extensive optimizations such as kernel fusion and redundancy bypass to fully exploit GPU computational power; finally, we extend CHGNet to support multiple GPUs and propose a load-balancing technique to enhance GPU utilization. Numerical results show that FastCHGNet reduces the memory footprint by a factor of 3.59. The final training time of FastCHGNet can be decreased to 1.53 hours on 32 GPUs without sacrificing model accuracy.
Submitted 30 December, 2024; originally announced December 2024.
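For a sense of scale, the reported numbers imply roughly a 130x wall-clock reduction, of which a factor of 32 comes from added GPUs; the remainder reflects per-GPU efficiency under an ideal-scaling assumption. The quick check below is my own arithmetic, not from the paper.

    # Back-of-the-envelope speedup implied by the reported training times.
    baseline_hours = 8.3 * 24      # CHGNet reference: 8.3 days on one A100 GPU
    fast_hours = 1.53              # FastCHGNet on 32 GPUs
    gpus = 32

    wall_clock = baseline_hours / fast_hours     # ~130x end to end
    per_gpu = wall_clock / gpus                  # ~4.1x, assuming ideal scaling
    print(f"{wall_clock:.0f}x wall-clock, {per_gpu:.1f}x per GPU")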
arXiv:2412.19648 (https://arxiv.org/abs/2412.19648) [cs.CV, cs.MM]
Enhancing Vision-Language Tracking by Effectively Converting Textual Cues into Visual Cues
Authors: X. Feng, D. Zhang, S. Hu, X. Li, M. Wu, J. Zhang, X. Chen, K. Huang
Abstract: Vision-Language Tracking (VLT) aims to localize a target in video sequences using a visual template and a language description. While textual cues enhance tracking potential, current datasets typically contain much more image data than text, limiting the ability of VLT methods to align the two modalities effectively. To address this imbalance, we propose a novel plug-and-play method named CTVLT that leverages the strong text-image alignment capabilities of foundation grounding models. CTVLT converts textual cues into interpretable visual heatmaps, which are easier for trackers to process. Specifically, we design a textual cue mapping module that transforms textual cues into target distribution heatmaps, visually representing the location described by the text. Additionally, a heatmap guidance module fuses these heatmaps with the search image to guide tracking more effectively. Extensive experiments on mainstream benchmarks demonstrate the effectiveness of our approach, achieving state-of-the-art performance and validating the utility of our method for enhanced VLT.
Submitted 27 December, 2024; originally announced December 2024.
Comments: Accepted by ICASSP '25. Code: https://github.com/XiaokunFeng/CTVLT
arXiv:2412.19589 (https://arxiv.org/abs/2412.19589) [cs.LG, cs.AI, q-bio.BM]
ViDTA: Enhanced Drug-Target Affinity Prediction via Virtual Graph Nodes and Attention-based Feature Fusion
Authors: Minghui Li, Zikang Guo, Yang Wu, Peijin Guo, Yao Shi, Shengshan Hu, Wei Wan, Shengqing Hu
Abstract: Drug-target interaction is fundamental to understanding how drugs affect biological systems, and accurately predicting drug-target affinity (DTA) is vital for drug discovery. Recently, deep learning methods have emerged as a significant approach for estimating the binding strength between drugs and target proteins. However, existing methods simply utilize the drug's local information from molecular topology rather than global information. Additionally, the features of drugs and proteins are usually fused with a simple concatenation operation, limiting their effectiveness. To address these challenges, we propose ViDTA, an enhanced DTA prediction framework. We introduce virtual nodes into the Graph Neural Network (GNN)-based drug feature extraction network, which act as a global memory to exchange messages more efficiently. By incorporating virtual graph nodes, we seamlessly integrate local and global features of drug molecular structures, expanding the GNN's receptive field.
Additionally, we propose an attention-based linear feature fusion network to better capture the interaction information between drugs and proteins. Experimental results on various benchmarks, including Davis, Metz, and KIBA, demonstrate that the proposed ViDTA outperforms state-of-the-art baselines.
Submitted 27 December, 2024; originally announced December 2024.
Comments: Accepted by the International Conference on Bioinformatics and Biomedicine (BIBM 24)
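The virtual-node idea referenced above amounts to adding one extra node connected to every atom in the molecular graph, so all atoms can exchange information within a single message-passing hop. A minimal adjacency-matrix sketch follows (my own toy example, not the ViDTA code; the 4-atom chain is an arbitrary placeholder):

    import numpy as np

    def add_virtual_node(adj):
        """Append a virtual node connected to every existing node of a molecular graph."""
        n = adj.shape[0]
        out = np.zeros((n + 1, n + 1), dtype=adj.dtype)
        out[:n, :n] = adj
        out[n, :n] = 1      # virtual node -> every atom
        out[:n, n] = 1      # every atom -> virtual node
        return out

    # Toy molecular graph: a 4-atom chain; the virtual node gives every atom pair a 2-hop shortcut
    chain = np.array([[0, 1, 0, 0],
                      [1, 0, 1, 0],
                      [0, 1, 0, 1],
                      [0, 0, 1, 0]])
    print(add_virtual_node(chain))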
arXiv:2412.19537 (https://arxiv.org/abs/2412.19537) [cs.CV, cs.HC]
Finger in Camera Speaks Everything: Unconstrained Air-Writing for Real-World
Authors: Meiqi Wu, Kaiqi Huang, Yuanqiang Cai, Shiyu Hu, Yuzhong Zhao, Weiqiang Wang
Abstract: Air-writing is a challenging task that combines the fields of computer vision and natural language processing, offering an intuitive and natural approach for human-computer interaction. However, current air-writing solutions face two primary challenges: (1) their dependency on complex sensors (e.g., radar, EEG, and others) for capturing precise handwritten trajectories, and (2) the absence of a video-based air-writing dataset that covers a comprehensive vocabulary range. These limitations impede their practicality in various real-world scenarios, including use on devices such as iPhones and laptops. To tackle these challenges, we present the groundbreaking air-writing Chinese character video dataset (AWCV-100K-UCAS2024), serving as a pioneering benchmark for video-based air-writing. This dataset captures handwritten trajectories in various real-world scenarios using commonly accessible RGB cameras, eliminating the need for complex sensors. AWCV-100K-UCAS2024 includes 8.8 million video frames, encompassing the complete set of 3,755 characters from the GB2312-80 level-1 set (GB1). Furthermore, we introduce our baseline approach, the video-based character recognizer (VCRec). VCRec adeptly extracts fingertip features from sparse visual cues and employs a spatio-temporal sequence module for analysis. Experimental results showcase the superior performance of VCRec compared to existing models in recognizing air-written characters, both quantitatively and qualitatively. This breakthrough paves the way for enhanced human-computer interaction in real-world contexts. Moreover, our approach leverages affordable RGB cameras, enabling its applicability in a diverse range of scenarios. The code and data examples will be made public at https://github.com/wmeiqi/AWCV.
Submitted 27 December, 2024; originally announced December 2024.
arXiv:2412.19279 (https://arxiv.org/abs/2412.19279) [cs.SD, cs.LG, eess.AS]
Improving Generalization for AI-Synthesized Voice Detection
Authors: Hainan Ren, Li Lin, Chun-Hao Liu, Xin Wang, Shu Hu
Abstract: AI-synthesized voice technology has the potential to create realistic human voices for beneficial applications, but it can also be misused for malicious purposes. While existing AI-synthesized voice detection models excel in intra-domain evaluation, they face challenges in generalizing across different domains, potentially becoming obsolete as new voice generators emerge. Current solutions use diverse data and advanced machine learning techniques (e.g., domain-invariant representation, self-supervised learning), but are limited by predefined vocoders and sensitivity to factors like background noise and speaker identity. In this work, we introduce an innovative disentanglement framework aimed at extracting domain-agnostic artifact features related to vocoders. Utilizing these features, we enhance model learning in a flat loss landscape, enabling escape from suboptimal solutions and improving generalization. Extensive experiments on benchmarks show our approach outperforms state-of-the-art methods, achieving up to a 5.12% improvement in the equal error rate metric in intra-domain and 7.59% in cross-domain evaluations.
Submitted 30 December, 2024; v1 submitted 26 December, 2024; originally announced December 2024.
Comments: AAAI25
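Since the equal error rate (EER) is the headline metric here, one generic way to estimate it from detector scores is to locate the threshold where the false acceptance and false rejection rates cross. The sketch below is standard metric code, not the authors'; the score arrays are toy placeholders.

    import numpy as np

    def equal_error_rate(real_scores, fake_scores):
        """Approximate EER: threshold where false acceptance ~= false rejection."""
        real_scores = np.asarray(real_scores)    # higher score = judged more likely real
        fake_scores = np.asarray(fake_scores)
        thresholds = np.sort(np.concatenate([real_scores, fake_scores]))
        far = np.array([(fake_scores >= t).mean() for t in thresholds])   # fakes accepted
        frr = np.array([(real_scores < t).mean() for t in thresholds])    # reals rejected
        i = np.argmin(np.abs(far - frr))
        return (far[i] + frr[i]) / 2

    # Toy scores: well-separated real vs. synthetic utterances give an EER of 0
    print(equal_error_rate([0.90, 0.80, 0.75, 0.60], [0.40, 0.30, 0.55, 0.20]))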
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.19279v2-abstract-full').style.display = 'none'; document.getElementById('2412.19279v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 26 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">AAAI25</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.18832">arXiv:2412.18832</a> <span> [<a href="https://arxiv.org/pdf/2412.18832">pdf</a>, <a href="https://arxiv.org/format/2412.18832">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> Structured Speaker-Deficiency Adaptation of Foundation Models for Dysarthric and Elderly Speech Recognition </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Hu%2C+S">Shujie Hu</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+X">Xurong Xie</a>, <a href="/search/cs?searchtype=author&query=Geng%2C+M">Mengzhe Geng</a>, <a href="/search/cs?searchtype=author&query=Deng%2C+J">Jiajun Deng</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+Z">Zengrui Jin</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+T">Tianzi Wang</a>, <a href="/search/cs?searchtype=author&query=Cui%2C+M">Mingyu Cui</a>, <a href="/search/cs?searchtype=author&query=Li%2C+G">Guinan Li</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zhaoqing Li</a>, <a href="/search/cs?searchtype=author&query=Meng%2C+H">Helen Meng</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+X">Xunying Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.18832v1-abstract-short" style="display: inline;"> Data-intensive fine-tuning of speech foundation models (SFMs) to scarce and diverse dysarthric and elderly speech leads to data bias and poor generalization to unseen speakers. This paper proposes novel structured speaker-deficiency adaptation approaches for SSL pre-trained SFMs on such data. Speaker and speech deficiency invariant SFMs were constructed in their supervised adaptive fine-tuning sta… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.18832v1-abstract-full').style.display = 'inline'; document.getElementById('2412.18832v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.18832v1-abstract-full" style="display: none;"> Data-intensive fine-tuning of speech foundation models (SFMs) to scarce and diverse dysarthric and elderly speech leads to data bias and poor generalization to unseen speakers. This paper proposes novel structured speaker-deficiency adaptation approaches for SSL pre-trained SFMs on such data. 
arXiv:2412.18619 (https://arxiv.org/abs/2412.18619) [pdf, other]
Subjects: cs.CL (Computation and Language); cs.AI (Artificial Intelligence); cs.CV (Computer Vision and Pattern Recognition); cs.LG (Machine Learning); cs.MM (Multimedia); eess.AS (Audio and Speech Processing)
Title: Next Token Prediction Towards Multimodal Intelligence: A Comprehensive Survey
Authors: Liang Chen, Zekun Wang, Shuhuai Ren, Lei Li, Haozhe Zhao, Yunshui Li, Zefan Cai, Hongcheng Guo, Lei Zhang, Yizhe Xiong, Yichi Zhang, Ruoyu Wu, Qingxiu Dong, Ge Zhang, Jian Yang, Lingwei Meng, Shujie Hu, Yulong Chen, Junyang Lin, Shuai Bai, Andreas Vlachos, Xu Tan, Minjia Zhang, Wen Xiao, Aaron Yee, et al. (2 additional authors not shown)
Abstract: Building on the foundations of language modeling in natural language processing, Next Token Prediction (NTP) has evolved into a versatile training objective for machine learning tasks across various modalities, achieving considerable success. As Large Language Models (LLMs) have advanced to unify understanding and generation tasks within the textual modality, recent research has shown that tasks from different modalities can also be effectively encapsulated within the NTP framework, transforming multimodal information into tokens and predicting the next one given the context. This survey introduces a comprehensive taxonomy that unifies both understanding and generation within multimodal learning through the lens of NTP. The proposed taxonomy covers five key aspects: multimodal tokenization, MMNTP model architectures, unified task representation, datasets & evaluation, and open challenges. This new taxonomy aims to aid researchers in their exploration of multimodal intelligence. An associated GitHub repository collecting the latest papers and repos is available at https://github.com/LMM101/Awesome-Multimodal-Next-Token-Prediction
Submitted 29 December, 2024; v1 submitted 16 December, 2024; originally announced December 2024.
Comments: 69 pages, 18 figures; repo at https://github.com/LMM101/Awesome-Multimodal-Next-Token-Prediction
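The NTP objective the survey is organized around can be written down compactly; the sketch below shows a generic multimodal next-token-prediction training step with shifted cross-entropy. The vocabulary size, model, and token stream are stand-ins, not taken from any surveyed system.

# Minimal sketch (hypothetical shapes and vocabulary): the NTP objective the survey
# organizes around -- tokenize everything (text, image patches, audio codes) into one
# vocabulary and train with shifted cross-entropy, whatever the modality of each position.
import torch
import torch.nn as nn

vocab_size, dim = 1024, 256               # assumed sizes; a real MMNTP vocabulary is far larger
embed = nn.Embedding(vocab_size, dim)
block = nn.TransformerEncoderLayer(dim, nhead=4, batch_first=True)
head  = nn.Linear(dim, vocab_size)

tokens = torch.randint(0, vocab_size, (2, 64))           # interleaved multimodal token ids
causal = nn.Transformer.generate_square_subsequent_mask(63)

hidden = block(embed(tokens[:, :-1]), src_mask=causal)    # condition on the prefix only
logits = head(hidden)
loss = nn.functional.cross_entropy(logits.reshape(-1, vocab_size), tokens[:, 1:].reshape(-1))
loss.backward()
print(float(loss))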
arXiv:2412.18125 (https://arxiv.org/abs/2412.18125) [pdf, other]
Subjects: cs.AI (Artificial Intelligence)
Title: Exact Acceleration of Subgraph Graph Neural Networks by Eliminating Computation Redundancy
Authors: Qian Tao, Xiyuan Wang, Muhan Zhang, Shuxian Hu, Wenyuan Yu, Jingren Zhou
Abstract: Graph neural networks (GNNs) have become a prevalent framework for graph tasks. Many recent studies have proposed the use of graph convolution methods over the numerous subgraphs of each graph, a concept known as subgraph graph neural networks (subgraph GNNs), to enhance GNNs' ability to distinguish non-isomorphic graphs. To maximize expressiveness, subgraph GNNs often require each subgraph to have equal size to the original graph. Despite their impressive performance, subgraph GNNs face challenges due to the vast number and large size of subgraphs, which lead to a surge in training data and result in both storage and computational inefficiencies. In response to this problem, this paper introduces Ego-Nets-Fit-All (ENFA), a model that uniformly takes the smaller ego nets as subgraphs, thereby providing greater storage and computational efficiency, while guaranteeing outputs identical to those of the original subgraph GNNs, even when those take the whole graph as subgraphs. The key is to identify and eliminate the redundant computation among subgraphs. For example, a node $v_i$ may appear in multiple subgraphs yet lie far away from all of their centers (the asymmetric part between subgraphs). Therefore, its first few rounds of message passing within each subgraph can be computed once in the original graph instead of being repeated within each subgraph. Such a strategy enables ENFA to accelerate subgraph GNNs exactly, unlike previous sampling approaches that often sacrifice performance. Extensive experiments across various datasets reveal that, compared with conventional subgraph GNNs, ENFA can reduce storage space by 29.0% to 84.5% and improve training efficiency by up to 1.66x.
Submitted 23 December, 2024; originally announced December 2024.
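The redundancy ENFA removes is easy to demonstrate on a toy graph: a node farther than k hops from a subgraph's marked center cannot be affected by the marking within k rounds of message passing, so those rounds coincide with the computation on the unmarked original graph. The sketch below (a hypothetical mean-aggregation propagator, not the ENFA code) checks this on a small path graph.

# Minimal sketch (not the ENFA implementation): the redundancy it exploits. In node-marking
# subgraph GNNs each "subgraph" is the full graph with one node flagged as the center. A node
# whose distance to the center exceeds the number of rounds k cannot yet feel the flag, so its
# first k rounds equal the unmarked computation and can be done once for the whole graph.
import torch

def propagate(adj, x, rounds):
    deg = adj.sum(1, keepdim=True).clamp(min=1)
    for _ in range(rounds):
        x = adj @ x / deg          # mean-aggregation message passing
    return x

# A 6-node path graph 0-1-2-3-4-5.
adj = torch.zeros(6, 6)
for i in range(5):
    adj[i, i + 1] = adj[i + 1, i] = 1.0
feats = torch.randn(6, 4)
k = 2

base = propagate(adj, torch.cat([feats, torch.zeros(6, 1)], 1), k)   # no center marked
mark0 = torch.zeros(6, 1); mark0[0] = 1.0                            # subgraph centered at node 0
sub0 = propagate(adj, torch.cat([feats, mark0], 1), k)

# Node 5 is 5 hops from center 0, which exceeds k, so its representation is identical in both runs:
print(torch.allclose(base[5], sub0[5]))   # True -> this per-subgraph work can be shared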
arXiv:2412.16958 (https://arxiv.org/abs/2412.16958) [pdf, other]
Subjects: cs.CV (Computer Vision and Pattern Recognition)
Title: Breaking Barriers in Physical-World Adversarial Examples: Improving Robustness and Transferability via Robust Feature
Authors: Yichen Wang, Yuxuan Chou, Ziqi Zhou, Hangtao Zhang, Wei Wan, Shengshan Hu, Minghui Li
Abstract: As deep neural networks (DNNs) are widely applied in the physical world, much research has focused on physical-world adversarial examples (PAEs), which introduce perturbations to inputs and cause the model's incorrect outputs. However, existing PAEs face two challenges: unsatisfactory attack performance (i.e., poor transferability and insufficient robustness to environmental conditions), and difficulty in balancing attack effectiveness with stealthiness, where better attack effectiveness often makes PAEs more perceptible. In this paper, we explore a novel perturbation-based method to overcome these challenges. For the first challenge, we introduce a strategy, Deceptive RF Injection, based on robust features (RFs) that are predictive, robust to perturbations, and consistent across different models. Specifically, it improves the transferability and robustness of PAEs by covering RFs of other classes onto the predictive features in clean images. For the second challenge, we introduce another strategy, Adversarial Semantic Pattern Minimization, which removes most perturbations and retains only the essential adversarial patterns in AEs. Based on the two strategies, we design our method Robust Feature Coverage Attack (RFCoA), comprising Robust Feature Disentanglement and Adversarial Feature Fusion. In the first stage, we extract target-class RFs in feature space. In the second stage, we use attention-based feature fusion to overlay these RFs onto the predictive features of clean images and remove unnecessary perturbations. Experiments show our method's superior transferability, robustness, and stealthiness compared to existing state-of-the-art methods. Additionally, our method's effectiveness can extend to Large Vision-Language Models (LVLMs), indicating its potential applicability to more complex tasks.
Submitted 22 December, 2024; originally announced December 2024.
Comments: Accepted by AAAI2025
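A rough sketch of the core move described above, overlaying target-class robust features onto a clean image by optimizing a bounded perturbation against a surrogate's feature map, is given below; the surrogate network, the robust-feature template, and the perturbation budget are all illustrative assumptions rather than the RFCoA pipeline.

# Minimal sketch (assumptions throughout, not the RFCoA implementation): optimize a
# perturbation so the surrogate's *feature-space* representation of the clean image moves
# toward a target-class "robust feature" template, rather than just flipping the logit.
import torch
import torch.nn as nn

surrogate = nn.Sequential(nn.Conv2d(3, 8, 3, padding=1), nn.ReLU(),
                          nn.AdaptiveAvgPool2d(4), nn.Flatten())   # toy feature extractor
clean = torch.rand(1, 3, 32, 32)
target_rf = torch.randn(1, 8 * 16)        # stand-in for extracted target-class robust features

delta = torch.zeros_like(clean, requires_grad=True)
opt = torch.optim.Adam([delta], lr=0.01)
for _ in range(50):
    feat = surrogate(torch.clamp(clean + delta, 0, 1))
    loss = nn.functional.mse_loss(feat, target_rf) + 0.01 * delta.abs().mean()  # cover RFs, stay sparse
    opt.zero_grad(); loss.backward(); opt.step()
    with torch.no_grad():
        delta.clamp_(-8 / 255, 8 / 255)    # keep the perturbation within a small budget
print(float(loss))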
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by AAAI2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.16955">arXiv:2412.16955</a> <span> [<a href="https://arxiv.org/pdf/2412.16955">pdf</a>, <a href="https://arxiv.org/format/2412.16955">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> NumbOD: A Spatial-Frequency Fusion Attack Against Object Detectors </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhou%2C+Z">Ziqi Zhou</a>, <a href="/search/cs?searchtype=author&query=Li%2C+B">Bowen Li</a>, <a href="/search/cs?searchtype=author&query=Song%2C+Y">Yufei Song</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+Z">Zhifei Yu</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+S">Shengshan Hu</a>, <a href="/search/cs?searchtype=author&query=Wan%2C+W">Wei Wan</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+L+Y">Leo Yu Zhang</a>, <a href="/search/cs?searchtype=author&query=Yao%2C+D">Dezhong Yao</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+H">Hai Jin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.16955v1-abstract-short" style="display: inline;"> With the advancement of deep learning, object detectors (ODs) with various architectures have achieved significant success in complex scenarios like autonomous driving. Previous adversarial attacks against ODs have been focused on designing customized attacks targeting their specific structures (e.g., NMS and RPN), yielding some results but simultaneously constraining their scalability. Moreover,… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.16955v1-abstract-full').style.display = 'inline'; document.getElementById('2412.16955v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.16955v1-abstract-full" style="display: none;"> With the advancement of deep learning, object detectors (ODs) with various architectures have achieved significant success in complex scenarios like autonomous driving. Previous adversarial attacks against ODs have been focused on designing customized attacks targeting their specific structures (e.g., NMS and RPN), yielding some results but simultaneously constraining their scalability. Moreover, most efforts against ODs stem from image-level attacks originally designed for classification tasks, resulting in redundant computations and disturbances in object-irrelevant areas (e.g., background). Consequently, how to design a model-agnostic efficient attack to comprehensively evaluate the vulnerabilities of ODs remains challenging and unresolved. In this paper, we propose NumbOD, a brand-new spatial-frequency fusion attack against various ODs, aimed at disrupting object detection within images. We directly leverage the features output by the OD without relying on its internal structures to craft adversarial examples. 
arXiv:2412.16720 (https://arxiv.org/abs/2412.16720) [pdf, other]
Subjects: cs.AI (Artificial Intelligence)
Title: OpenAI o1 System Card
Authors: OpenAI: Aaron Jaech, Adam Kalai, Adam Lerer, Adam Richardson, Ahmed El-Kishky, Aiden Low, Alec Helyar, Aleksander Madry, Alex Beutel, Alex Carney, Alex Iftimie, Alex Karpenko, Alex Tachard Passos, Alexander Neitz, Alexander Prokofiev, Alexander Wei, Allison Tam, Ally Bennett, Ananya Kumar, Andre Saraiva, Andrea Vallone, Andrew Duberstein, Andrew Kondrich, et al. (238 additional authors not shown)
Abstract: The o1 model series is trained with large-scale reinforcement learning to reason using chain of thought. These advanced reasoning capabilities provide new avenues for improving the safety and robustness of our models. In particular, our models can reason about our safety policies in context when responding to potentially unsafe prompts, through deliberative alignment. This leads to state-of-the-art performance on certain benchmarks for risks such as generating illicit advice, choosing stereotyped responses, and succumbing to known jailbreaks. Training models to incorporate a chain of thought before answering has the potential to unlock substantial benefits, while also increasing potential risks that stem from heightened intelligence. Our results underscore the need for building robust alignment methods, extensively stress-testing their efficacy, and maintaining meticulous risk management protocols. This report outlines the safety work carried out for the OpenAI o1 and OpenAI o1-mini models, including safety evaluations, external red teaming, and Preparedness Framework evaluations.
Submitted 21 December, 2024; originally announced December 2024.
arXiv:2412.16651 (https://arxiv.org/abs/2412.16651) [pdf, other]
Subjects: cs.CV (Computer Vision and Pattern Recognition); cs.AI (Artificial Intelligence)
Title: PB-UAP: Hybrid Universal Adversarial Attack For Image Segmentation
Authors: Yufei Song, Ziqi Zhou, Minghui Li, Xianlong Wang, Hangtao Zhang, Menghao Deng, Wei Wan, Shengshan Hu, Leo Yu Zhang
Abstract: With the rapid advancement of deep learning, model robustness has become a significant research hotspot, i.e., adversarial attacks on deep neural networks. Existing works primarily focus on image classification tasks, aiming to alter the model's predicted labels. Due to the output complexity and deeper network architectures, research on adversarial examples for segmentation models is still limited, particularly for universal adversarial perturbations. In this paper, we propose a novel universal adversarial attack method designed for segmentation models, which includes dual feature separation and low-frequency scattering modules. The two modules guide the training of adversarial examples in the pixel and frequency spaces, respectively. Experiments demonstrate that our method achieves high attack success rates surpassing the state-of-the-art methods, and exhibits strong transferability across different models.
Submitted 3 January, 2025; v1 submitted 21 December, 2024; originally announced December 2024.
Comments: Accepted by ICASSP 2025
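For readers unfamiliar with universal adversarial perturbations, the sketch below shows the generic UAP setup the paper builds on: a single perturbation optimized over many images so that a segmentation model's per-pixel predictions degrade. The toy model, pseudo-labels, and budget are assumptions; the paper's dual feature separation and low-frequency scattering modules are not reproduced here.

# Minimal sketch (stand-in model and data, not PB-UAP itself): one fixed perturbation delta,
# optimized over many images to push a segmentation model away from its clean per-pixel
# predictions, then applied unchanged to new images.
import torch
import torch.nn as nn

seg_model = nn.Sequential(nn.Conv2d(3, 8, 3, padding=1), nn.ReLU(),
                          nn.Conv2d(8, 21, 1))              # toy per-pixel classifier, 21 classes
for p in seg_model.parameters():
    p.requires_grad_(False)                                 # the victim model stays frozen
images = torch.rand(16, 3, 64, 64)                          # stand-in training images
pseudo_labels = seg_model(images).argmax(1)                 # clean predictions to push away from

delta = torch.zeros(1, 3, 64, 64, requires_grad=True)       # the single universal perturbation
opt = torch.optim.Adam([delta], lr=0.01)
for _ in range(100):
    idx = torch.randint(0, 16, (4,))
    logits = seg_model((images[idx] + delta).clamp(0, 1))
    loss = -nn.functional.cross_entropy(logits, pseudo_labels[idx])   # maximize the per-pixel loss
    opt.zero_grad(); loss.backward(); opt.step()
    with torch.no_grad():
        delta.clamp_(-10 / 255, 10 / 255)                   # keep the universal perturbation small

print("mean |delta|:", float(delta.abs().mean()))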
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.16651v2-abstract-full').style.display = 'none'; document.getElementById('2412.16651v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 21 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by ICASSP 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.15496">arXiv:2412.15496</a> <span> [<a href="https://arxiv.org/pdf/2412.15496">pdf</a>, <a href="https://arxiv.org/format/2412.15496">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Understanding When and Why Graph Attention Mechanisms Work via Node Classification </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ma%2C+Z">Zhongtian Ma</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Q">Qiaosheng Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+B">Bocheng Zhou</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yexin Zhang</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+S">Shuyue Hu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zhen Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.15496v1-abstract-short" style="display: inline;"> Despite the growing popularity of graph attention mechanisms, their theoretical understanding remains limited. This paper aims to explore the conditions under which these mechanisms are effective in node classification tasks through the lens of Contextual Stochastic Block Models (CSBMs). Our theoretical analysis reveals that incorporating graph attention mechanisms is \emph{not universally benefic… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.15496v1-abstract-full').style.display = 'inline'; document.getElementById('2412.15496v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.15496v1-abstract-full" style="display: none;"> Despite the growing popularity of graph attention mechanisms, their theoretical understanding remains limited. This paper aims to explore the conditions under which these mechanisms are effective in node classification tasks through the lens of Contextual Stochastic Block Models (CSBMs). Our theoretical analysis reveals that incorporating graph attention mechanisms is \emph{not universally beneficial}. Specifically, by appropriately defining \emph{structure noise} and \emph{feature noise} in graphs, we show that graph attention mechanisms can enhance classification performance when structure noise exceeds feature noise. 
arXiv:2412.13420 (https://arxiv.org/abs/2412.13420) [pdf, other]
Subjects: cs.SI (Social and Information Networks)
Title: BotSim: LLM-Powered Malicious Social Botnet Simulation
Authors: Boyu Qiao, Kun Li, Wei Zhou, Shilong Li, Qianqian Lu, Songlin Hu
Abstract: Social media platforms like X (Twitter) and Reddit are vital to global communication. However, advancements in Large Language Model (LLM) technology give rise to social media bots with unprecedented intelligence. These bots adeptly simulate human profiles, conversations, and interactions, disseminating large amounts of false information and posing significant challenges to platform regulation. To better understand and counter these threats, we design BotSim, a malicious social botnet simulation powered by LLMs. BotSim mimics the information dissemination patterns of real-world social networks, creating a virtual environment composed of intelligent agent bots and real human users. In the temporal simulation constructed by BotSim, these advanced agent bots autonomously engage in social interactions such as posting and commenting, effectively modeling scenarios of information flow and user interaction. Building on the BotSim framework, we construct a highly human-like, LLM-driven bot dataset called BotSim-24 and benchmark multiple bot detection strategies against it. The experimental results indicate that detection methods effective on traditional bot datasets perform worse on BotSim-24, highlighting the urgent need for new detection strategies to address the cybersecurity threats posed by these advanced bots.
Submitted 17 December, 2024; originally announced December 2024.
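The event-loop structure such a simulation needs is roughly as sketched below; everything here, including the placeholder generate_text function standing in for an LLM call, is hypothetical and not the BotSim framework itself.

# Minimal sketch (entirely hypothetical, not BotSim): bot agents with personas take turns
# posting and commenting into a shared timeline over discrete simulation rounds.
import random

def generate_text(persona, context):        # placeholder for an LLM-backed generator
    return f"[{persona}] reacting to: {context[-1] if context else 'nothing yet'}"

timeline = []                                # shared feed of (author, text) events
bots = [{"name": f"bot_{i}", "persona": random.choice(["news", "sports", "crypto"])} for i in range(3)]

for step in range(5):                        # discrete-time simulation rounds
    actor = random.choice(bots)
    action = random.choice(["post", "comment"]) if timeline else "post"
    text = generate_text(actor["persona"], [t for _, t in timeline])
    timeline.append((actor["name"], text if action == "post" else "re: " + text))

for author, text in timeline:
    print(author, "->", text)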
arXiv:2412.12626 (https://arxiv.org/abs/2412.12626) [pdf, other]
Subjects: cs.CV (Computer Vision and Pattern Recognition); cs.CR (Cryptography and Security)
Title: Improving the Transferability of 3D Point Cloud Attack via Spectral-aware Admix and Optimization Designs
Authors: Shiyu Hu, Daizong Liu, Wei Hu
Abstract: Deep learning models for point clouds have been shown to be vulnerable to adversarial attacks, which have received increasing attention in various safety-critical applications such as autonomous driving, robotics, and surveillance. Existing 3D attackers generally design various attack strategies in the white-box setting, requiring prior knowledge of the 3D model details. However, real-world 3D applications are in the black-box setting, where we can only acquire the outputs of the target classifier. Although a few recent works try to explore the black-box attack, they still achieve limited attack success rates (ASR). To alleviate this issue, this paper focuses on attacking 3D models in a transfer-based black-box setting, where we first carefully design adversarial examples in a white-box surrogate model and then transfer them to attack other black-box victim models. Specifically, we propose a novel Spectral-aware Admix with Augmented Optimization method (SAAO) to improve adversarial transferability. In particular, since the traditional Admix strategy is deployed in the 2D domain, where pixel-wise images are added for perturbation, we cannot directly follow it to merge point clouds in the coordinate domain, as doing so would destroy the geometric shapes. Therefore, we design a spectral-aware fusion that performs the Graph Fourier Transform (GFT) to obtain spectral features of the point clouds and adds them in the spectral domain. Afterwards, we run a few steps with spectral-aware weighted Admix to select better optimization paths as well as to adjust the corresponding learning weights. Finally, we run more steps to generate adversarial spectral features along the optimization path and perform the inverse GFT on the adversarial spectral features to obtain the adversarial example in the data domain. Experiments show that our SAAO achieves better transferability compared to existing 3D attack methods.
Submitted 17 December, 2024; originally announced December 2024.
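The Graph Fourier Transform step that SAAO builds on can be sketched directly: build a k-NN graph over the points, use Laplacian eigenvectors as the spectral basis, mix or perturb in that basis, and transform back. The sketch below uses arbitrary point counts and noise scales and is not the SAAO optimization itself.

# Minimal sketch (not the SAAO method): forward and inverse GFT of a point cloud via the
# eigenvectors of a k-NN graph Laplacian, perturbing only the high-frequency bands so the
# overall shape is preserved better than by coordinate-wise mixing.
import torch

pts = torch.randn(128, 3)                                   # a toy point cloud
k = 8
dists = torch.cdist(pts, pts)
knn = dists.topk(k + 1, largest=False).indices[:, 1:]       # k nearest neighbours (skip self)

adj = torch.zeros(128, 128)
adj.scatter_(1, knn, 1.0)
adj = ((adj + adj.t()) > 0).float()                         # symmetrize
lap = torch.diag(adj.sum(1)) - adj                          # combinatorial graph Laplacian

evals, evecs = torch.linalg.eigh(lap)                       # GFT basis: Laplacian eigenvectors
spec = evecs.t() @ pts                                      # forward GFT of the coordinates

spec[64:] += 0.05 * torch.randn_like(spec[64:])             # perturb only high-frequency bands
adv_pts = evecs @ spec                                      # inverse GFT back to coordinates
print(adv_pts.shape, float((adv_pts - pts).norm()))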
arXiv:2412.12000 (https://arxiv.org/abs/2412.12000) [pdf, other]
Subjects: cs.AI (Artificial Intelligence)
Title: CP-Guard: Malicious Agent Detection and Defense in Collaborative Bird's Eye View Perception
Authors: Senkang Hu, Yihang Tao, Guowen Xu, Yiqin Deng, Xianhao Chen, Yuguang Fang, Sam Kwong
Abstract: Collaborative Perception (CP) has emerged as a promising technique for autonomous driving, where multiple connected and autonomous vehicles (CAVs) share their perception information to enhance the overall perception performance and expand the perception range. However, in CP, the ego CAV needs to receive messages from its collaborators, which makes it vulnerable to attacks from malicious agents. For example, a malicious agent can send harmful information to the ego CAV to mislead it. To address this critical issue, we propose a novel method, CP-Guard, a tailored defense mechanism for CP that can be deployed by each agent to accurately detect and eliminate malicious agents in its collaboration network. Our key idea is to enable CP to reach a consensus with, rather than a conflict against, the ego CAV's perception results. Based on this idea, we first develop a probability-agnostic sample consensus (PASAC) method to effectively sample a subset of the collaborators and verify the consensus without prior probabilities of malicious agents. Furthermore, we define a collaborative consistency loss (CCLoss) to capture the discrepancy between the ego CAV and its collaborators, which is used as a verification criterion for consensus. Finally, we conduct extensive experiments on collaborative bird's eye view (BEV) perception tasks and our results demonstrate the effectiveness of CP-Guard.
Submitted 16 December, 2024; originally announced December 2024.
Comments: Accepted by AAAI'25
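The probability-agnostic sample consensus idea resembles RANSAC over collaborator subsets; the sketch below illustrates that flavor with toy perception vectors and a plain relative-distance check standing in for the paper's CCLoss, so it should be read as an analogy rather than CP-Guard's algorithm.

# Minimal sketch (toy "perception" vectors, not CP-Guard): repeatedly sample a subset of
# collaborators, fuse them, and keep collaborators whose messages stay consistent with the
# ego vehicle's own result, with no prior on how many collaborators are malicious.
import torch

torch.manual_seed(0)
ego = torch.randn(64)                                   # ego CAV's own BEV feature (stand-in)
collaborators = [ego + 0.05 * torch.randn(64) for _ in range(5)]
collaborators.append(torch.randn(64))                   # one malicious agent sending junk

def consistency(a, b):                                   # stand-in for the paper's CCLoss
    return float((a - b).norm() / (a.norm() + 1e-9))

threshold, trusted = 0.5, set()
for _ in range(20):                                      # sample subsets, no prior on attack rate
    idx = torch.randperm(len(collaborators))[:3].tolist()
    fused = torch.stack([collaborators[i] for i in idx]).mean(0)
    if consistency(fused, ego) < threshold:              # the subset as a whole agrees with ego
        trusted.update(i for i in idx if consistency(collaborators[i], ego) < threshold)

print("kept collaborators:", sorted(trusted))            # the malicious index should be absent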
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.12000v1-abstract-full').style.display = 'none'; document.getElementById('2412.12000v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by AAAI'25</span> </p> </li> </ol> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Hu%2C+S&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Hu%2C+S&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Hu%2C+S&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Hu%2C+S&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Hu%2C+S&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Hu%2C+S&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">…</span></li> </ul> </nav> <div class="is-hidden-tablet"> <!-- feedback for mobile only --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a> </span> </div> </div> </main> <footer> <div class="columns is-desktop" role="navigation" aria-label="Secondary"> <!-- MetaColumn 1 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> 
</li> </ul> </div> </div> </div> <!-- end MetaColumn 1 --> <!-- MetaColumn 2 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div> <!-- end MetaColumn 2 --> </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>