Search | arXiv e-print repository
Showing 1–50 of 453 results for author: Yin, J
Searching in archive cs. Search in all archives: https://arxiv.org/search/?searchtype=author&query=Yin%2C+J
Sorted by announcement date (newest first); 50 results per page.
1. arXiv:2502.13441 (https://arxiv.org/abs/2502.13441) [pdf, other]
   Subjects: cs.CL (Computation and Language); cs.AI (Artificial Intelligence)
   Title: The Self-Improvement Paradox: Can Language Models Bootstrap Reasoning Capabilities without External Scaffolding?
   Authors: Yutao Sun, Mingshuai Chen, Tiancheng Zhao, Ruochen Xu, Zilun Zhang, Jianwei Yin
   Abstract: Self-improvement of large language models (LLMs) -- i.e., improving the performance of an LLM by fine-tuning it with synthetic data generated by itself -- is a promising way to advance the capabilities of LLMs while avoiding extensive supervision. Existing approaches to self-improvement often rely on external supervision signals in the form of seed data and/or assistance from third-party models. This paper presents Crescent -- a simple yet effective framework for generating high-quality synthetic question-answer data in a fully autonomous manner. Crescent first elicits the LLM to generate raw questions via a bait prompt, then diversifies these questions using rejection-sampling-based self-deduplication, and finally feeds the questions to the LLM and collects the corresponding answers by means of majority voting. We show that Crescent sheds light on the potential of true self-improvement with zero external supervision signals for math reasoning; in particular, Crescent-generated question-answer pairs suffice to (i) improve the reasoning capabilities of an LLM while preserving its general performance (especially in the 0-shot setting); and (ii) distil LLM knowledge to weaker models more effectively than existing methods based on seed-dataset augmentation.
   Submitted 19 February, 2025; originally announced February 2025.
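A minimal sketch of a Crescent-style round as described in the abstract above (bait prompt, rejection-sampling self-deduplication, majority voting). Everything here is an assumption for illustration: the `generate` callable stands in for any LLM sampling API, and the prompts, similarity threshold, and vote count are invented.

```python
# Hypothetical sketch of a Crescent-style self-improvement round.
# `generate(prompt, n)` is a stand-in for any LLM sampling API returning
# a list of n strings; prompts and thresholds are illustrative guesses.
from collections import Counter

def crescent_round(generate, n_questions=100, n_votes=8, sim_threshold=0.9):
    # 1) Elicit raw questions with a bait prompt.
    questions = generate("Pose a challenging math problem:", n=n_questions)

    # 2) Self-deduplicate via rejection sampling: accept a question only
    #    if it is not too similar to any already-accepted one.
    kept = []
    for q in questions:
        if all(similarity(q, k) < sim_threshold for k in kept):
            kept.append(q)

    # 3) Answer each question several times; majority vote picks the label.
    pairs = []
    for q in kept:
        answers = generate(f"Question: {q}\nAnswer:", n=n_votes)
        best, count = Counter(answers).most_common(1)[0]
        if count > n_votes // 2:           # keep only confident majorities
            pairs.append((q, best))
    return pairs                           # fine-tuning data for the same LLM

def similarity(a, b):
    # Toy lexical overlap; the paper's dedup criterion is not specified here.
    ta, tb = set(a.split()), set(b.split())
    return len(ta & tb) / max(1, len(ta | tb))
```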
2. arXiv:2502.05790 (https://arxiv.org/abs/2502.05790) [pdf, other]
   Subjects: cs.LG (Machine Learning)
   Title: I3S: Importance Sampling Subspace Selection for Low-Rank Optimization in LLM Pretraining
   Authors: Haochen Zhang, Junze Yin, Guanchu Wang, Zirui Liu, Tianyi Zhang, Anshumali Shrivastava, Lin Yang, Vladimir Braverman
   Abstract: Low-rank optimization has emerged as a promising approach to enabling memory-efficient training of large language models (LLMs). Existing low-rank optimization methods typically project gradients onto a low-rank subspace, reducing the memory cost of storing optimizer states. A key challenge in these methods is identifying suitable subspaces to ensure an effective optimization trajectory. Most existing approaches select the dominant subspace to preserve gradient information, as this intuitively provides the best approximation. However, we find that in practice, the dominant subspace stops changing during pretraining, thereby constraining weight updates to similar subspaces. In this paper, we propose importance sampling subspace selection (I3S) for low-rank optimization, which theoretically offers a comparable convergence rate to the dominant subspace approach. Empirically, we demonstrate that I3S significantly outperforms previous methods in LLM pretraining tasks.
   Submitted 9 February, 2025; originally announced February 2025.
3. arXiv:2501.19034 (https://arxiv.org/abs/2501.19034) [pdf, other]
   Subjects: cs.CV (Computer Vision and Pattern Recognition)
   Title: XRF V2: A Dataset for Action Summarization with Wi-Fi Signals, and IMUs in Phones, Watches, Earbuds, and Glasses
   Authors: Bo Lan, Pei Li, Jiaxi Yin, Yunpeng Song, Ge Wang, Han Ding, Jinsong Han, Fei Wang
   Abstract: Human Action Recognition (HAR) plays a crucial role in applications such as health monitoring, smart home automation, and human-computer interaction. While HAR has been extensively studied, action summarization, which involves identifying and summarizing continuous actions, remains an emerging task. This paper introduces the novel XRF V2 dataset, designed for indoor daily activity Temporal Action Localization (TAL) and action summarization. XRF V2 integrates multimodal data from Wi-Fi signals, IMU sensors (smartphones, smartwatches, headphones, and smart glasses), and synchronized video recordings, offering a diverse collection of indoor activities from 16 volunteers across three distinct environments. To tackle TAL and action summarization, we propose the XRFMamba neural network, which excels at capturing long-term dependencies in untrimmed sensory sequences and outperforms state-of-the-art methods such as ActionFormer and WiFiTAD. We envision XRF V2 as a valuable resource for advancing research in human action localization, action forecasting, pose estimation, multimodal foundation model pre-training, synthetic data generation, and more.
   Submitted 31 January, 2025; originally announced January 2025.
   Comments: 27 pages, 11 figures, 8 tables
4. arXiv:2501.15194 (https://arxiv.org/abs/2501.15194) [pdf, other]
   Subjects: cs.LG (Machine Learning); stat.CO (Computation); stat.ML (Machine Learning)
   Title: Reliable Pseudo-labeling via Optimal Transport with Attention for Short Text Clustering
   Authors: Zhihao Yao, Jixuan Yin, Bo Li
   Abstract: Short text clustering has gained significant attention in the data mining community. However, the limited valuable information contained in short texts often leads to low-discriminative representations, increasing the difficulty of clustering. This paper proposes a novel short text clustering framework, Reliable Pseudo-labeling via Optimal Transport with Attention for Short Text Clustering (POTA), that generates reliable pseudo-labels to aid discriminative representation learning for clustering. Specifically, POTA first implements an instance-level attention mechanism to capture the semantic relationships among samples, which are then incorporated as a semantic consistency regularization term into an optimal transport problem. Solving this OT problem yields reliable pseudo-labels that simultaneously account for sample-to-sample semantic consistency and sample-to-cluster global structure. Additionally, the proposed OT formulation adaptively estimates cluster distributions, making POTA well suited to datasets with varying degrees of imbalance. POTA then uses the pseudo-labels to guide contrastive learning, producing discriminative representations and efficient clustering. Extensive experiments demonstrate that POTA outperforms state-of-the-art methods. The code is available at: https://github.com/YZH0905/POTA-STC/tree/main.
   Submitted 4 February, 2025; v1 submitted 25 January, 2025; originally announced January 2025.
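For readers unfamiliar with OT-based pseudo-labeling, a bare Sinkhorn iteration shows the core mechanism the abstract presupposes: transporting samples onto clusters under a target cluster distribution. POTA's attention-derived semantic consistency regularizer is omitted; this is only the standard building block, with invented sizes and hyperparameters.

```python
# Minimal Sinkhorn sketch of OT-based pseudo-labeling: transport n samples
# onto k clusters so assignments respect both model scores and a target
# cluster distribution. Epsilon and iteration count are illustrative.
import numpy as np

def sinkhorn_pseudo_labels(logits, cluster_prior, eps=0.05, n_iters=50):
    n, k = logits.shape
    K = np.exp(logits / eps)                 # positive affinity kernel
    a = np.full(n, 1.0 / n)                  # uniform mass per sample
    b = cluster_prior                        # estimated cluster proportions
    u, v = np.ones(n), np.ones(k)
    for _ in range(n_iters):                 # alternating marginal scaling
        u = a / (K @ v)
        v = b / (K.T @ u)
    plan = u[:, None] * K * v[None, :]       # transport plan; rows ~ soft labels
    return plan.argmax(axis=1)

rng = np.random.default_rng(1)
logits = rng.standard_normal((200, 5))
labels = sinkhorn_pseudo_labels(logits, cluster_prior=np.full(5, 0.2))
print(np.bincount(labels, minlength=5))      # roughly balanced assignments
```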
5. arXiv:2501.12418 (https://arxiv.org/abs/2501.12418) [pdf, other]
   Subjects: cs.CV (Computer Vision and Pattern Recognition); cs.AI (Artificial Intelligence)
   Title: ImageRef-VL: Enabling Contextual Image Referencing in Vision-Language Models
   Authors: Jingwei Yi, Junhao Yin, Ju Xu, Peng Bao, Yongliang Wang, Wei Fan, Hao Wang
   Abstract: Vision-Language Models (VLMs) have demonstrated remarkable capabilities in understanding multimodal inputs and have been widely integrated into Retrieval-Augmented Generation (RAG) based conversational systems. While current VLM-powered chatbots can provide textual source references in their responses, they exhibit significant limitations in referencing contextually relevant images during conversations. In this paper, we introduce Contextual Image Reference -- the ability to appropriately reference relevant images from retrieval documents based on conversation context -- and systematically investigate VLMs' capability in this aspect. We conduct the first evaluation of contextual image referencing, comprising a dedicated testing dataset and evaluation metrics. Furthermore, we propose ImageRef-VL, a method that significantly enhances open-source VLMs' image referencing capabilities through instruction fine-tuning on a large-scale, manually curated multimodal conversation dataset. Experimental results demonstrate that ImageRef-VL not only outperforms proprietary models but also achieves an 88% performance improvement over state-of-the-art open-source VLMs in contextual image referencing tasks. Our code is available at https://github.com/bytedance/ImageRef-VL.
   Submitted 20 January, 2025; originally announced January 2025.
6. arXiv:2501.09804 (https://arxiv.org/abs/2501.09804) [pdf, other]
   Subjects: cs.LG (Machine Learning); cs.AI (Artificial Intelligence); cs.CL (Computation and Language)
   Title: Enhancing Generalization in Chain of Thought Reasoning for Smaller Models
   Authors: Maxwell J. Yin, Dingyi Jiang, Yongbing Chen, Boyu Wang, Charles Ling
   Abstract: Chain-of-Thought (CoT) reasoning in smaller language models is a challenging natural language processing problem, yet it is highly desirable in many real-life applications. Existing CoT knowledge distillation methods often suffer from overly conservative memorization in smaller LLMs, leading to low generalization confidence. Since fully preserving the CoT ability of the teacher model is impossible, we hypothesize that adversarial CoT fine-tuning is crucial for developing smaller LLMs with robust CoT generalization. To this end, we propose PRompt-Assisted Domain-Adversarial fine-tuning (PRADA), a principled fine-tuning framework that integrates diverse CoT domains. Specifically, PRADA pioneers two CoT improvements in smaller LLMs: (1) recovering the domain-invariant features that are typically lost during distillation through domain-adversarial fine-tuning; and (2) enhancing the domain adaptability of CoT prompt engineering by employing domain-adversarial approaches. We theoretically demonstrate the effectiveness of our approach and empirically show that it significantly outperforms the state of the art on a wide range of tasks. Moreover, our empirical findings reveal that a smaller LLM, when leveraging PRADA, aligns closely with domain knowledge, thereby improving the explainability of our approach.
   Submitted 16 January, 2025; originally announced January 2025.
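The phrase "domain-adversarial fine-tuning" usually denotes DANN-style training with a gradient reversal layer. Whether PRADA uses exactly this operator is not stated in the abstract, so the sketch below is the generic building block, not the paper's method.

```python
# A generic gradient-reversal layer, the standard mechanism behind
# "domain-adversarial" training (DANN-style): features flow forward
# unchanged, while gradients from a domain classifier are flipped so the
# encoder unlearns domain-specific cues. How PRADA wires this into CoT
# distillation is an open detail here.
import torch
from torch import nn

class GradReverse(torch.autograd.Function):
    @staticmethod
    def forward(ctx, x, lam):
        ctx.lam = lam
        return x.view_as(x)

    @staticmethod
    def backward(ctx, grad_out):
        return -ctx.lam * grad_out, None   # reverse (and scale) the gradient

class DomainAdversarialHead(nn.Module):
    def __init__(self, dim, n_domains, lam=1.0):
        super().__init__()
        self.lam = lam
        self.clf = nn.Linear(dim, n_domains)

    def forward(self, features):
        return self.clf(GradReverse.apply(features, self.lam))

# Usage sketch: total_loss = task_loss + ce(domain_head(encoder_features), domain_ids)
```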
7. arXiv:2501.04606 (https://arxiv.org/abs/2501.04606) [pdf, other]
   Subjects: cs.CV (Computer Vision and Pattern Recognition)
   Title: Enhancing Low-Cost Video Editing with Lightweight Adaptors and Temporal-Aware Inversion
   Authors: Yangfan He, Sida Li, Kun Li, Jianhui Wang, Binxu Li, Tianyu Shi, Jun Yin, Miao Zhang, Xueqian Wang
   Abstract: Recent advancements in text-to-image (T2I) generation using diffusion models have enabled cost-effective video-editing applications by leveraging pre-trained models, eliminating the need for resource-intensive training. However, the frame-independence of T2I generation often results in poor temporal consistency. Existing methods address this issue through temporal-layer fine-tuning or inference-based temporal propagation, but these approaches suffer from high training costs or limited temporal coherence. To address these challenges, we propose a General and Efficient Adapter (GE-Adapter) that integrates temporal-spatial and semantic consistency with bilateral DDIM inversion. This framework introduces three key components: (1) Frame-based Temporal Consistency Blocks (FTC Blocks) to capture frame-specific features and enforce smooth inter-frame transitions via temporally-aware loss functions; (2) Channel-dependent Spatial Consistency Blocks (SCD Blocks) employing bilateral filters to enhance spatial coherence by reducing noise and artifacts; and (3) a Token-based Semantic Consistency Module (TSC Module) to maintain semantic alignment using shared prompt tokens and frame-specific tokens. Our method significantly improves perceptual quality, text-image alignment, and temporal coherence, as demonstrated on the MSR-VTT dataset. Additionally, it achieves enhanced fidelity and frame-to-frame coherence, offering a practical solution for T2V editing.
   Submitted 8 January, 2025; originally announced January 2025.
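The SCD Blocks above rely on bilateral filtering, a classic edge-preserving smoother. As a reference point, here is a plain single-channel implementation; the sigma values and window radius are arbitrary choices for illustration, not GE-Adapter's settings.

```python
# A plain single-channel bilateral filter: each pixel is replaced by a
# weighted average of its neighbors, weighted by both spatial distance and
# intensity difference, so edges are preserved while noise is smoothed.
import numpy as np

def bilateral_filter(img, radius=2, sigma_space=2.0, sigma_range=0.1):
    h, w = img.shape
    out = np.zeros_like(img)
    ys, xs = np.mgrid[-radius:radius + 1, -radius:radius + 1]
    spatial = np.exp(-(ys**2 + xs**2) / (2 * sigma_space**2))
    pad = np.pad(img, radius, mode="edge")
    for i in range(h):
        for j in range(w):
            patch = pad[i:i + 2 * radius + 1, j:j + 2 * radius + 1]
            rng_w = np.exp(-((patch - img[i, j]) ** 2) / (2 * sigma_range**2))
            weights = spatial * rng_w
            out[i, j] = np.sum(weights * patch) / np.sum(weights)
    return out

img = np.random.default_rng(0).random((32, 32))
smoothed = bilateral_filter(img)
```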
8. arXiv:2501.04453 (https://arxiv.org/abs/2501.04453) [pdf, other]
   Subjects: cs.LG (Machine Learning)
   Title: Gradient Purification: Defense Against Poisoning Attack in Decentralized Federated Learning
   Authors: Bin Li, Xiaoye Miao, Yongheng Shang, Xinkui Zhao, Shuiguang Deng, Jianwei Yin
   Abstract: Decentralized federated learning (DFL) is inherently vulnerable to poisoning attacks, as malicious clients can transmit manipulated model gradients to neighboring clients. Existing defense methods either reject suspicious gradients per iteration or restart DFL aggregation after detecting all malicious clients, overlooking the potential accuracy benefit from the discarded malicious gradients. In this paper, we propose a novel gradient purification defense, named GPD, that integrates seamlessly with existing DFL aggregation to defend against poisoning attacks. It aims to mitigate the harm in model gradients while retaining the benefit in model weights for enhanced accuracy. Each benign client in GPD maintains a recording variable that tracks the historically aggregated gradients from each of its neighbors, allowing it to precisely detect malicious neighbors and swiftly mitigate aggregated malicious gradients via historical consistency checks. Upon mitigation, GPD optimizes model weights by aggregating gradients solely from benign clients. This retains the previously beneficial portions contributed by malicious clients and exploits the contributions from benign clients, thereby significantly enhancing model accuracy. We analyze the convergence of GPD as well as its ability to attain high accuracy. Extensive experiments over three datasets demonstrate that GPD mitigates poisoning attacks under both IID and non-IID data distributions, and significantly outperforms state-of-the-art defenses in terms of accuracy against various poisoning attacks.
   Submitted 8 January, 2025; originally announced January 2025.
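A toy version of a historical consistency check in the spirit of GPD's recording variables: keep a running sum of each neighbor's past gradients and flag a new gradient that contradicts that history. The cosine test and zero threshold are guesses for illustration, not the paper's detection rule.

```python
# Hypothetical neighbor monitor: a "recording variable" per neighbor holds
# its accumulated past gradients; a new gradient is accepted only if it is
# directionally consistent with that history.
import numpy as np

class NeighborMonitor:
    def __init__(self, dim, threshold=0.0):
        self.history = {}                  # neighbor id -> summed past gradients
        self.threshold = threshold
        self.dim = dim

    def accept(self, nid, grad):
        hist = self.history.setdefault(nid, np.zeros(self.dim))
        denom = np.linalg.norm(hist) * np.linalg.norm(grad)
        consistent = denom == 0 or float(hist @ grad) / denom >= self.threshold
        if consistent:
            self.history[nid] = hist + grad
        return consistent                  # False -> exclude from aggregation

monitor = NeighborMonitor(dim=4)
print(monitor.accept("n1", np.ones(4)))    # True: no history yet
print(monitor.accept("n1", np.ones(4)))    # True: consistent direction
print(monitor.accept("n1", -np.ones(4)))   # False: flips against history
```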
9. arXiv:2501.03292 (https://arxiv.org/abs/2501.03292) [pdf, other]
   Subjects: cs.LG (Machine Learning); cs.AI (Artificial Intelligence)
   Title: Multi-Modal One-Shot Federated Ensemble Learning for Medical Data with Vision Large Language Model
   Authors: Naibo Wang, Yuchen Deng, Shichen Fan, Jianwei Yin, See-Kiong Ng
   Abstract: Federated learning (FL) has attracted considerable interest in the medical domain due to its capacity to facilitate collaborative model training while maintaining data privacy. However, conventional FL methods typically necessitate multiple communication rounds, leading to significant communication overhead and delays, especially in environments with limited bandwidth. One-shot federated learning addresses these issues by conducting model training and aggregation in a single communication round, thereby reducing communication costs while preserving privacy. Among these approaches, one-shot federated ensemble learning combines independently trained client models using ensemble techniques such as voting, further boosting performance in non-IID data scenarios. On the other hand, existing machine learning methods in healthcare predominantly use unimodal data (e.g., medical images or textual reports), which restricts their diagnostic accuracy and comprehensiveness. We therefore propose integrating multi-modal data to address these shortcomings. In this paper, we introduce FedMME, an innovative one-shot multi-modal federated ensemble learning framework that utilizes multi-modal data for medical image analysis. Specifically, FedMME capitalizes on vision large language models to produce textual reports from medical images, employs a BERT model to extract textual features from these reports, and amalgamates these features with visual features to improve diagnostic accuracy. Experimental results show that our method outperforms existing one-shot federated learning methods in healthcare scenarios across four datasets with various data distributions. For instance, it surpasses existing one-shot federated learning approaches by more than 17.5% in accuracy on the RSNA dataset under a Dirichlet distribution with α = 0.3.
   Submitted 6 January, 2025; originally announced January 2025.
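The "Dirichlet distribution with α = 0.3" refers to the standard protocol for simulating non-IID federated clients: each class's samples are split across clients in Dirichlet-drawn proportions, and smaller α means heavier label skew. A generic partitioner (not FedMME's code) might look like this:

```python
# Standard Dirichlet(alpha) protocol for simulating non-IID clients:
# smaller alpha -> more skewed per-client label distribution.
import numpy as np

def dirichlet_partition(labels, n_clients, alpha, rng):
    client_idx = [[] for _ in range(n_clients)]
    for c in np.unique(labels):
        idx = rng.permutation(np.where(labels == c)[0])
        props = rng.dirichlet(np.full(n_clients, alpha))   # class share per client
        cuts = (np.cumsum(props)[:-1] * len(idx)).astype(int)
        for client, part in enumerate(np.split(idx, cuts)):
            client_idx[client].extend(part.tolist())
    return client_idx

rng = np.random.default_rng(42)
labels = rng.integers(0, 3, size=3000)
parts = dirichlet_partition(labels, n_clients=5, alpha=0.3, rng=rng)
print([len(p) for p in parts])   # uneven, label-skewed shards
```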
10. arXiv:2412.20799 (https://arxiv.org/abs/2412.20799) [pdf, other]
    Subjects: cs.MM (Multimedia)
    Title: SFE-Net: Harnessing Biological Principles of Differential Gene Expression for Improved Feature Selection in Deep Learning Networks
    Authors: Yuqi Li, Yuanzhong Zheng, Yaoxuan Wang, Jianjun Yin, Haojun Fei
    Abstract: In the realm of DeepFake detection, the challenge of adapting to various synthesis methodologies such as Faceswap, Deepfakes, Face2Face, and NeuralTextures significantly impacts the performance of traditional machine learning models. These models often suffer from static feature representation, which struggles to perform consistently across diversely generated deepfake datasets. Inspired by the biological concept of differential gene expression, where gene activation is dynamically regulated in response to environmental stimuli, we introduce the Selective Feature Expression Network (SFE-Net). This innovative framework integrates selective feature activation principles into deep learning architectures, allowing the model to dynamically adjust feature priorities in response to varying deepfake generation techniques. SFE-Net employs a novel mechanism that selectively enhances critical features essential for accurately detecting forgeries while reducing the impact of irrelevant or misleading cues, akin to adaptive evolutionary processes in nature. Through rigorous testing on a range of deepfake datasets, SFE-Net not only surpasses existing static models in detecting sophisticated forgeries but also shows enhanced generalization capabilities in cross-dataset scenarios. Our approach significantly mitigates overfitting by maintaining a dynamic balance between feature exploration and exploitation, thus producing more robust and effective deepfake detection models. This bio-inspired strategy paves the way for adaptive deep learning systems that are finely tuned to the nuanced challenges posed by the varied nature of digital forgeries in modern digital forensics.
    Submitted 30 December, 2024; originally announced December 2024.
    Comments: 5 pages, 3 figures, 2 charts, conference
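One common realization of input-conditioned "selective feature expression" is a squeeze-and-excitation-style gate that reweights channels per sample. The abstract does not say SFE-Net uses this exact design; the module below only illustrates the general idea of dynamically adjusting feature priorities.

```python
# Generic squeeze-and-excitation-style channel gate: global context is
# squeezed into a per-channel priority in (0, 1), which amplifies or
# suppresses each feature channel per input sample.
import torch
from torch import nn

class FeatureGate(nn.Module):
    def __init__(self, channels, reduction=4):
        super().__init__()
        self.gate = nn.Sequential(
            nn.AdaptiveAvgPool2d(1),              # squeeze: global context
            nn.Flatten(),
            nn.Linear(channels, channels // reduction),
            nn.ReLU(),
            nn.Linear(channels // reduction, channels),
            nn.Sigmoid(),                         # per-channel priority
        )

    def forward(self, x):
        w = self.gate(x).unsqueeze(-1).unsqueeze(-1)
        return x * w                              # reweighted features

x = torch.randn(2, 32, 16, 16)
print(FeatureGate(32)(x).shape)                   # torch.Size([2, 32, 16, 16])
```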
11. arXiv:2412.20249 (https://arxiv.org/abs/2412.20249) [pdf, other]
    Subjects: cs.DC (Distributed, Parallel, and Cluster Computing)
    Title: Next-Gen Interconnection Systems with Compute Express Link: a Comprehensive Survey
    Authors: Chen Chen, Xinkui Zhao, Guanjie Cheng, Yuesheng Xu, Shuiguang Deng, Jianwei Yin
    Abstract: Interconnection is crucial for computing systems. However, the current interconnection performance between processors and devices, such as memory devices and accelerators, significantly lags behind their computing performance, severely limiting overall performance. To address this challenge, Intel proposed Compute Express Link (CXL), an open industry-standard interconnect. With memory semantics, CXL offers low-latency, scalable, and coherent interconnection between processors and devices. This paper introduces recent advances in CXL-based interconnection systems with memory semantics. We classify the existing research into three categories: pooling memory, distributed shared memory, and unified memory. Pooling memory interconnects processors and memory, aiming to address the memory wall challenge. Distributed shared memory interconnects processors across nodes, aiming to synchronize the cluster. Unified memory interconnects processors and accelerators, aiming to enhance collaboration in heterogeneous computing systems. Finally, we discuss future research directions and envision memory-centric computing with CXL.
    Submitted 28 December, 2024; originally announced December 2024.
    Comments: 14 pages
In this paper, we propose a formalized and comprehensive environment to evaluate the entire process of automated GUI Testing (GTArena), offering a fair, standardized environment for consistent operation of diverse multimodal large language models. We divide the testing process into three key subtasks: test intention generation, test task execution, and GUI defect detection, and construct a benchmark dataset based on these to conduct a comprehensive evaluation. It evaluates the performance of different models using three data types: real mobile applications, mobile applications with artificially injected defects, and synthetic data, thoroughly assessing their capabilities in this task. Additionally, we propose a method that helps researchers explore the correlation between the performance of multimodal large language models in specific scenarios and their general capabilities in standard benchmark tests. Experimental results indicate that even the most advanced models struggle to perform well across all sub-tasks of automated GUI Testing, highlighting a significant gap between the current capabilities of Autonomous GUI Testing and its practical, real-world applicability. This gap provides guidance for the future direction of GUI Agent development. Our code is available at https://github.com/ZJU-ACES-ISE/ChatUITest. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p>
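<p class="is-size-7">For intuition, the three-subtask split above suggests an evaluation loop of roughly the following shape. This is a minimal sketch under assumed names (<code>model</code>, the sample fields, exact-match scoring), not GTArena's actual harness.</p> <pre><code class="language-python"># Hypothetical mini-harness mirroring GTArena's three subtasks.
SUBTASKS = ["test_intention_generation", "test_task_execution", "gui_defect_detection"]

def evaluate(model, dataset):
    """Score a model per subtask with exact-match accuracy (a stand-in metric)."""
    results = {}
    for subtask in SUBTASKS:
        samples = [s for s in dataset if s["subtask"] == subtask]
        correct = sum(1 for s in samples if model(s["input"]) == s["label"])
        results[subtask] = correct / max(len(samples), 1)
    return results

# Toy usage with a trivial stand-in "model":
dataset = [
    {"subtask": "gui_defect_detection", "input": "screen_001", "label": "crash"},
    {"subtask": "test_task_execution", "input": "task_007", "label": "tap(login)"},
]
print(evaluate(lambda x: "crash", dataset))
</code></pre>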
</li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.17331">arXiv:2412.17331</a> <span> [<a href="https://arxiv.org/pdf/2412.17331">pdf</a>, <a href="https://arxiv.org/format/2412.17331">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Uncertainty-Participation Context Consistency Learning for Semi-supervised Semantic Segmentation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yin%2C+J">Jianjian Yin</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yi Chen</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+Z">Zhichao Zheng</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+J">Junsheng Zhou</a>, <a href="/search/cs?searchtype=author&query=Gu%2C+Y">Yanhui Gu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> Semi-supervised semantic segmentation has attracted considerable attention for its ability to mitigate the reliance on extensive labeled data. However, existing consistency regularization methods only utilize highly certain pixels with prediction confidence surpassing a fixed threshold for training, failing to fully leverage the potential supervisory information within the network. Therefore, this paper proposes the Uncertainty-participation Context Consistency Learning (UCCL) method to explore richer supervisory signals. Specifically, we first design the semantic backpropagation update (SBU) strategy to fully exploit the knowledge from uncertain pixel regions, enabling the model to learn consistent pixel-level semantic information from those areas. Furthermore, we propose the class-aware knowledge regulation (CKR) module to facilitate the regulation of class-level semantic features across different augmented views, promoting consistent learning of class-level semantic information within the encoder. Experimental results on two public benchmarks demonstrate that our proposed method achieves state-of-the-art performance. Our code is available at https://github.com/YUKEKEJAN/UCCL. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 23 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">To be published in ICASSP</span> </p>
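<p class="is-size-7">The fixed-threshold scheme criticized above, next to one simple way of also drawing signal from uncertain pixels, can be sketched as follows. Entropy minimization here is a generic stand-in for the paper's SBU strategy, so this is an assumption-laden illustration, not the released UCCL code.</p> <pre><code class="language-python">import torch
import torch.nn.functional as F

def pseudo_label_losses(logits, threshold=0.95):
    """logits: (B, C, H, W) predictions on an unlabeled batch."""
    probs = F.softmax(logits, dim=1)
    conf, pseudo = probs.max(dim=1)              # per-pixel confidence and label
    certain = conf.ge(threshold)                 # pixels above the fixed threshold
    uncertain = ~certain                         # pixels most methods discard
    ce = F.cross_entropy(logits, pseudo, reduction="none")
    loss_certain = (ce * certain).sum() / certain.sum().clamp(min=1)
    # Generic extra signal from uncertain pixels: sharpen their distributions.
    ent = -(probs * probs.clamp(min=1e-8).log()).sum(dim=1)
    loss_uncertain = (ent * uncertain).sum() / uncertain.sum().clamp(min=1)
    return loss_certain, loss_uncertain

lc, lu = pseudo_label_losses(torch.randn(2, 21, 64, 64))
print(lc.item(), lu.item())
</code></pre>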
</li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.17226">arXiv:2412.17226</a> <span> [<a href="https://arxiv.org/pdf/2412.17226">pdf</a>, <a href="https://arxiv.org/format/2412.17226">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> OLiDM: Object-aware LiDAR Diffusion Models for Autonomous Driving </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yan%2C+T">Tianyi Yan</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+J">Junbo Yin</a>, <a href="/search/cs?searchtype=author&query=Lang%2C+X">Xianpeng Lang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+R">Ruigang Yang</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+C">Cheng-Zhong Xu</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+J">Jianbing Shen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> To enhance autonomous driving safety in complex scenarios, various methods have been proposed to simulate LiDAR point cloud data. Nevertheless, these methods often face challenges in producing high-quality, diverse, and controllable foreground objects. To address the needs of object-aware tasks in 3D perception, we introduce OLiDM, a novel framework capable of generating high-fidelity LiDAR data at both the object and the scene levels. OLiDM consists of two pivotal components: the Object-Scene Progressive Generation (OPG) module and the Object Semantic Alignment (OSA) module. OPG adapts to user-specific prompts to generate desired foreground objects, which are subsequently employed as conditions in scene generation, ensuring controllable outputs at both the object and scene levels. This also facilitates the association of user-defined object-level annotations with the generated LiDAR scenes. Moreover, OSA aims to rectify the misalignment between foreground objects and background scenes, enhancing the overall quality of the generated objects. The broad effectiveness of OLiDM is demonstrated across various LiDAR generation tasks, as well as in 3D perception tasks.
Specifically, on the KITTI-360 dataset, OLiDM surpasses prior state-of-the-art methods such as UltraLiDAR by 17.5 in FPD. Additionally, in sparse-to-dense LiDAR completion, OLiDM achieves a significant improvement over LiDARGen, with a 57.47% increase in semantic IoU. Moreover, OLiDM enhances the performance of mainstream 3D detectors by 2.4% in mAP and 1.9% in NDS, underscoring its potential in advancing object-aware 3D tasks. Code is available at: https://yanty123.github.io/OLiDM. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">AAAI 2025, https://yanty123.github.io/OLiDM</span> </p>
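<p class="is-size-7">The object-then-scene flow of OPG can be made concrete with two stand-in samplers. Both functions below are hypothetical placeholders for diffusion models; the sketch shows only how object-level annotations carry over into scene generation.</p> <pre><code class="language-python">import random

def sample_objects(prompts):
    """Stage 1 (stand-in): one labeled foreground 'object' per user prompt."""
    return [{"label": p, "center": [random.uniform(0, 50) for _ in range(3)]}
            for p in prompts]

def sample_scene(objects):
    """Stage 2 (stand-in): scene generation conditioned on stage-1 objects,
    so user-defined object annotations stay attached to the generated scene."""
    return {"points": "background omitted in this sketch", "annotations": objects}

scene = sample_scene(sample_objects(["car", "pedestrian"]))
print([obj["label"] for obj in scene["annotations"]])
</code></pre>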
</li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.12808">arXiv:2412.12808</a> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Detecting Emotional Incongruity of Sarcasm by Commonsense Reasoning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Qiu%2C+Z">Ziqi Qiu</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+J">Jianxing Yu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yufeng Zhang</a>, <a href="/search/cs?searchtype=author&query=Lai%2C+H">Hanjiang Lai</a>, <a href="/search/cs?searchtype=author&query=Rao%2C+Y">Yanghui Rao</a>, <a href="/search/cs?searchtype=author&query=Su%2C+Q">Qinliang Su</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+J">Jian Yin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> This paper focuses on sarcasm detection, which aims to identify whether given statements convey criticism, mockery, or other negative sentiment opposite to the literal meaning. To detect sarcasm, humans often require a comprehensive understanding of the semantics in the statement and even resort to external commonsense to infer the fine-grained incongruity. However, existing methods lack commonsense inferential ability when they face complex real-world scenarios, leading to unsatisfactory performance. To address this problem, we propose a novel framework for sarcasm detection, which conducts incongruity reasoning based on commonsense augmentation, called EICR. Concretely, we first employ retrieval-augmented large language models to supplement the missing but indispensable commonsense background knowledge. To capture complex contextual associations, we construct a dependency graph and obtain the optimized topology via graph refinement. We further introduce an adaptive reasoning skeleton that integrates prior rules to extract sentiment-inconsistent subgraphs explicitly. To eliminate the possible spurious relations between words and labels, we employ adversarial contrastive learning to enhance the robustness of the detector. Experiments conducted on five datasets demonstrate the effectiveness of EICR. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 17 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">In the experimental chapter, there is a problem with the experimental setting that needs to be corrected</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.11139">arXiv:2412.11139</a> <span> [<a href="https://arxiv.org/pdf/2412.11139">pdf</a>, <a href="https://arxiv.org/format/2412.11139">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Symbolic Computation">cs.SC</span> </div> </div> <p class="title is-5 mathjax"> ViSymRe: Vision-guided Multimodal Symbolic Regression </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+D">Da Li</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+J">Junping Yin</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+J">Jin Xu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+X">Xinxin Li</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Juan Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax">
Symbolic regression automatically searches for mathematical equations to reveal underlying mechanisms within datasets, offering enhanced interpretability compared to black-box models. Traditionally, symbolic regression has been considered to be purely numeric-driven, with insufficient attention given to the potential contributions of visual information in augmenting this process. When dealing with high-dimensional and complex datasets, existing symbolic regression models are often inefficient and tend to generate overly complex equations, making subsequent mechanism analysis complicated. In this paper, we propose the vision-guided multimodal symbolic regression model, called ViSymRe, that systematically explores how visual information can improve various metrics of symbolic regression. Compared to traditional models, our proposed model has the following innovations: (1) It integrates three modalities, vision, symbolic, and numeric, to enhance symbolic regression, enabling the model to benefit from the strengths of each modality; (2) It establishes a meta-learning framework that can learn from historical experiences to efficiently solve new symbolic regression problems; (3) It emphasizes the simplicity and structural rationality of the equations rather than merely numerical fitting. Extensive experiments show that our proposed model exhibits strong generalization capability and noise resistance. The equations it generates outperform state-of-the-art numeric-only baselines in terms of fitting effect, simplicity, and structural accuracy, thus being able to facilitate accurate mechanism analysis and the development of theoretical models. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024.
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.09126">arXiv:2412.09126</a> <span> [<a href="https://arxiv.org/pdf/2412.09126">pdf</a>, <a href="https://arxiv.org/format/2412.09126">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a href="https://doi.org/10.1145/3696409.3700225">10.1145/3696409.3700225</a></span> </div> </div> </div> <p class="title is-5 mathjax"> Enhancing Modality Representation and Alignment for Multimodal Cold-start Active Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Shen%2C+M">Meng Shen</a>, <a href="/search/cs?searchtype=author&query=Wei%2C+Y">Yake Wei</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+J">Jianxiong Yin</a>, <a href="/search/cs?searchtype=author&query=Rajan%2C+D">Deepu Rajan</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+D">Di Hu</a>, <a href="/search/cs?searchtype=author&query=See%2C+S">Simon See</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> Training multimodal models requires a large amount of labeled data. Active learning (AL) aims to reduce labeling costs. Most AL methods employ warm-start approaches, which rely on sufficient labeled data to train a well-calibrated model that can assess the uncertainty and diversity of unlabeled data. However, when assembling a dataset, labeled data are often scarce initially, leading to a cold-start problem. Additionally, most AL methods seldom address multimodal data, highlighting a research gap in this field. Our research addresses these issues by developing a two-stage method for Multi-Modal Cold-Start Active Learning (MMCSAL). Firstly, we observe the modality gap, a significant distance between the centroids of representations from different modalities, when only using cross-modal pairing information as self-supervision signals. This modality gap affects the data selection process, as we calculate both uni-modal and cross-modal distances.
To address this, we introduce uni-modal prototypes to bridge the modality gap. Secondly, conventional AL methods often falter in multimodal scenarios where alignment between modalities is overlooked. Therefore, we propose enhancing cross-modal alignment through regularization, thereby improving the quality of selected multimodal data pairs in AL. Finally, our experiments demonstrate MMCSAL's efficacy in selecting multimodal data pairs across three multimodal datasets. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">11 pages, ACMMM Asia 2024, Oral Presentation</span> </p>
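<p class="is-size-7">The "modality gap" described above has a direct numeric reading: the distance between the centroids of each modality's normalized embeddings. A minimal sketch (random tensors stand in for real paired features, and the prototype construction of MMCSAL is omitted):</p> <pre><code class="language-python">import torch

def modality_gap(emb_a, emb_b):
    """emb_a, emb_b: (N, D) embeddings from two modalities of the same data."""
    a = torch.nn.functional.normalize(emb_a, dim=1).mean(dim=0)  # centroid A
    b = torch.nn.functional.normalize(emb_b, dim=1).mean(dim=0)  # centroid B
    return (a - b).norm().item()

audio = torch.randn(512, 128)
video = torch.randn(512, 128) + 0.8     # shifted cluster: a visible gap
print(modality_gap(audio, video))
</code></pre>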
</li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.05672">arXiv:2412.05672</a> <span> [<a href="https://arxiv.org/pdf/2412.05672">pdf</a>, <a href="https://arxiv.org/format/2412.05672">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Graph with Sequence: Broad-Range Semantic Modeling for Fake News Detection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yin%2C+J">Junwei Yin</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+M">Min Gao</a>, <a href="/search/cs?searchtype=author&query=Shu%2C+K">Kai Shu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+W">Wentao Li</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+Y">Yinqiu Huang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zongwei Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> The rapid proliferation of fake news on social media threatens social stability, creating an urgent demand for more effective detection methods. While many promising approaches have emerged, most rely on content analysis with limited semantic depth, leading to suboptimal comprehension of news content. To address this limitation, capturing broader-range semantics is essential yet challenging, as it introduces two primary types of noise: fully connecting sentences in news graphs often adds unnecessary structural noise, while highly similar but authenticity-irrelevant sentences introduce feature noise, complicating the detection process. To tackle these issues, we propose BREAK, a broad-range semantics model for fake news detection that leverages a fully connected graph to capture comprehensive semantics while employing dual denoising modules to minimize both structural and feature noise. The semantic structure denoising module balances the graph's connectivity by iteratively refining it between two bounds: a sequence-based structure as a lower bound and a fully connected graph as the upper bound. This refinement uncovers label-relevant semantic interrelation structures. Meanwhile, the semantic feature denoising module reduces noise from similar semantics by diversifying representations, aligning distinct outputs from the denoised graph and sequence encoders using KL-divergence to achieve feature diversification in high-dimensional space. The two modules are jointly optimized in a bi-level framework, enhancing the integration of denoised semantics into a comprehensive representation for detection. Extensive experiments across four datasets demonstrate that BREAK significantly outperforms existing fake news detection methods. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 7 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p>
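<p class="is-size-7">The KL term between the graph and sequence branches has a standard form, sketched below. Whether the term is minimized or maximized (to diversify features) follows the paper rather than this sketch, and the two-class logits are placeholders.</p> <pre><code class="language-python">import torch
import torch.nn.functional as F

def kl_between_heads(graph_logits, seq_logits):
    """KL(P_graph || P_seq) between the two encoders' output distributions."""
    log_p = F.log_softmax(graph_logits, dim=-1)
    log_q = F.log_softmax(seq_logits, dim=-1)
    return F.kl_div(log_q, log_p, log_target=True, reduction="batchmean")

g = torch.randn(8, 2)   # fake-vs-real logits from the graph encoder (toy)
s = torch.randn(8, 2)   # logits from the sequence encoder (toy)
print(kl_between_heads(g, s).item())
</code></pre>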
</li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.01422">arXiv:2412.01422</a> <span> [<a href="https://arxiv.org/pdf/2412.01422">pdf</a>, <a href="https://arxiv.org/format/2412.01422">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> MamKPD: A Simple Mamba Baseline for Real-Time 2D Keypoint Detection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Dang%2C+Y">Yonghao Dang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+L">Liyuan Liu</a>, <a href="/search/cs?searchtype=author&query=Kang%2C+H">Hui Kang</a>, <a href="/search/cs?searchtype=author&query=Ye%2C+P">Ping Ye</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+J">Jianqin Yin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> Real-time 2D keypoint detection plays an essential role in computer vision. Although CNN-based and Transformer-based methods have achieved breakthrough progress, they often fail to deliver superior performance and real-time speed. This paper introduces MamKPD, the first efficient yet effective Mamba-based pose estimation framework for 2D keypoint detection. The conventional Mamba module exhibits limited information interaction between patches. To address this, we propose a lightweight contextual modeling module (CMM) that uses depth-wise convolutions to model inter-patch dependencies and linear layers to distill the pose cues within each patch. Subsequently, by combining Mamba for global modeling across all patches, MamKPD effectively extracts instances' pose information. We conduct extensive experiments on human and animal pose estimation datasets to validate the effectiveness of MamKPD. Our MamKPD-L achieves 77.3% AP on the COCO dataset with 1492 FPS on an NVIDIA RTX 4090 GPU. Moreover, MamKPD achieves state-of-the-art results on the MPII dataset and competitive results on the AP-10K dataset while saving 85% of the parameters compared to ViTPose. Our project page is available at https://mamkpd.github.io/. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p>
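<p class="is-size-7">A rough sketch of a contextual modeling module in the spirit of the CMM described above: a depth-wise convolution mixes neighboring patches and a linear layer distills cues within each patch. Kernel size and layout are assumptions, not the paper's configuration.</p> <pre><code class="language-python">import torch
import torch.nn as nn

class CMMSketch(nn.Module):
    def __init__(self, dim, kernel_size=3):
        super().__init__()
        self.dw = nn.Conv2d(dim, dim, kernel_size, padding=kernel_size // 2,
                            groups=dim)          # depth-wise: inter-patch mixing
        self.proj = nn.Linear(dim, dim)          # per-patch channel distillation
        self.act = nn.GELU()

    def forward(self, x):                        # x: (B, C, H, W) patch grid
        x = self.dw(x)
        x = x.permute(0, 2, 3, 1)                # (B, H, W, C) for the linear
        x = self.proj(self.act(x))
        return x.permute(0, 3, 1, 2)

y = CMMSketch(64)(torch.randn(1, 64, 16, 16))
print(y.shape)
</code></pre>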
</li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.19261">arXiv:2411.19261</a> <span> [<a href="https://arxiv.org/pdf/2411.19261">pdf</a>, <a href="https://arxiv.org/format/2411.19261">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Improving Multi-Subject Consistency in Open-Domain Image Generation with Isolation and Reposition Attention </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=He%2C+H">Huiguo He</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Q">Qiuyue Wang</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+Y">Yuan Zhou</a>, <a href="/search/cs?searchtype=author&query=Cai%2C+Y">Yuxuan Cai</a>, <a href="/search/cs?searchtype=author&query=Chao%2C+H">Hongyang Chao</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+J">Jian Yin</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+H">Huan Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> Training-free diffusion models have achieved remarkable progress in generating multi-subject consistent images within open-domain scenarios. The key idea of these methods is to incorporate reference subject information within the attention layer. However, existing methods still obtain suboptimal performance when handling numerous subjects. This paper reveals the two primary issues contributing to this deficiency. Firstly, there is undesired interference among different subjects within the target image. Secondly, tokens tend to reference nearby tokens, which reduces the effectiveness of the attention mechanism when there is a significant positional difference between subjects in reference and target images. To address these challenges, we propose a training-free diffusion model with Isolation and Reposition Attention, named IR-Diffusion. Specifically, Isolation Attention ensures that multiple subjects in the target image do not reference each other, effectively eliminating subject fusion. On the other hand, Reposition Attention involves scaling and repositioning subjects in both reference and target images to the same position within the images. This ensures that subjects in the target image can better reference those in the reference image, thereby maintaining better consistency.
Extensive experiments demonstrate that the proposed methods significantly enhance multi-subject consistency, outperforming all existing methods in open-domain scenarios. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p>
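<p class="is-size-7">Only the masking idea of Isolation Attention is reproduced in the sketch below: target-image tokens may attend within their own subject (or to background) but not across subjects. The reference-image attention and Reposition Attention of IR-Diffusion are not modeled here.</p> <pre><code class="language-python">import torch

def isolation_mask(subject_ids):
    """Boolean attention mask: token i may attend to token j only if they
    share a subject id, or if either token is background (id 0)."""
    same = subject_ids.unsqueeze(0) == subject_ids.unsqueeze(1)
    bg = (subject_ids == 0).unsqueeze(0) | (subject_ids == 0).unsqueeze(1)
    return same | bg                     # True = attention allowed

ids = torch.tensor([1, 1, 2, 2, 0])      # two subjects plus a background token
mask = isolation_mask(ids)
scores = torch.randn(5, 5).masked_fill(~mask, float("-inf"))
print(scores.softmax(dim=-1).round(decimals=2))
</code></pre>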
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.17254v1-abstract-full').style.display = 'none'; document.getElementById('2411.17254v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.16044">arXiv:2411.16044</a> <span> [<a href="https://arxiv.org/pdf/2411.16044">pdf</a>, <a href="https://arxiv.org/format/2411.16044">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> ZoomEye: Enhancing Multimodal LLMs with Human-Like Zooming Capabilities through Tree-Based Image Exploration </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Shen%2C+H">Haozhan Shen</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+K">Kangjia Zhao</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+T">Tiancheng Zhao</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+R">Ruochen Xu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Z">Zilun Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+M">Mingwei Zhu</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+J">Jianwei Yin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.16044v1-abstract-short" style="display: inline;"> An image, especially with high-resolution, typically consists of numerous visual elements, ranging from dominant large objects to fine-grained detailed objects. When perceiving such images, multimodal large language models~(MLLMs) face limitations due to the restricted input resolution of the pretrained vision encoder and the cluttered, dense context of the image, resulting in a focus on primary o… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.16044v1-abstract-full').style.display = 'inline'; document.getElementById('2411.16044v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.16044v1-abstract-full" style="display: none;"> An image, especially with high-resolution, typically consists of numerous visual elements, ranging from dominant large objects to fine-grained detailed objects. When perceiving such images, multimodal large language models~(MLLMs) face limitations due to the restricted input resolution of the pretrained vision encoder and the cluttered, dense context of the image, resulting in a focus on primary objects while easily overlooking detailed ones. In this paper, we propose Zoom Eye, a tree search algorithm designed to navigate the hierarchical and visual nature of images to capture relevant information. Zoom Eye conceptualizes an image as a tree, with each children node representing a zoomed sub-patch of the parent node and the root represents the overall image. 
</li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.16044">arXiv:2411.16044</a> <span> [<a href="https://arxiv.org/pdf/2411.16044">pdf</a>, <a href="https://arxiv.org/format/2411.16044">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> ZoomEye: Enhancing Multimodal LLMs with Human-Like Zooming Capabilities through Tree-Based Image Exploration </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Shen%2C+H">Haozhan Shen</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+K">Kangjia Zhao</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+T">Tiancheng Zhao</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+R">Ruochen Xu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Z">Zilun Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+M">Mingwei Zhu</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+J">Jianwei Yin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> An image, especially with high resolution, typically consists of numerous visual elements, ranging from dominant large objects to fine-grained detailed objects. When perceiving such images, multimodal large language models (MLLMs) face limitations due to the restricted input resolution of the pretrained vision encoder and the cluttered, dense context of the image, resulting in a focus on primary objects while easily overlooking detailed ones. In this paper, we propose Zoom Eye, a tree search algorithm designed to navigate the hierarchical and visual nature of images to capture relevant information. Zoom Eye conceptualizes an image as a tree, with each child node representing a zoomed sub-patch of the parent node and the root representing the overall image. Moreover, Zoom Eye is model-agnostic and training-free, so it enables any MLLM to simulate human zooming actions by searching along the image tree from root to leaf nodes, seeking out pertinent information, and accurately responding to related queries. We experiment on a series of elaborate high-resolution benchmarks and the results demonstrate that Zoom Eye not only consistently improves the performance of a series of base MLLMs by a large margin (e.g., LLaVA-v1.5-7B increases by 34.57% on $V^*$ Bench and 17.88% on HR-Bench), but also enables small 7B MLLMs to outperform strong large models such as GPT-4o. Our code is available at https://github.com/om-ai-lab/ZoomEye. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p>
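<p class="is-size-7">The image-as-tree idea can be sketched with quadrant children and a greedy descent; the real method searches more broadly, and the <code>relevance</code> scorer below is a stand-in for an MLLM's judgment.</p> <pre><code class="language-python"># Greedy quad-tree zoom: the root is the full image, children are quadrants.
def zoom_search(box, relevance, depth=0, max_depth=3):
    x0, y0, x1, y1 = box
    if depth == max_depth:
        return box                                   # leaf: answer from this crop
    mx, my = (x0 + x1) / 2, (y0 + y1) / 2
    children = [(x0, y0, mx, my), (mx, y0, x1, my),
                (x0, my, mx, y1), (mx, my, x1, y1)]
    best = max(children, key=relevance)              # descend into the best patch
    return zoom_search(best, relevance, depth + 1, max_depth)

# Toy scorer: prefer patches whose center is near a "target" at (700, 300).
target = (700, 300)
score = lambda b: -((b[0]+b[2])/2 - target[0])**2 - ((b[1]+b[3])/2 - target[1])**2
print(zoom_search((0, 0, 1024, 1024), score))
</code></pre>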
</li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.13144">arXiv:2411.13144</a> <span> [<a href="https://arxiv.org/pdf/2411.13144">pdf</a>, <a href="https://arxiv.org/format/2411.13144">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> CopyrightMeter: Revisiting Copyright Protection in Text-to-image Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xu%2C+N">Naen Xu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+C">Changjiang Li</a>, <a href="/search/cs?searchtype=author&query=Du%2C+T">Tianyu Du</a>, <a href="/search/cs?searchtype=author&query=Li%2C+M">Minxi Li</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+W">Wenjie Luo</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+J">Jiacheng Liang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yuyuan Li</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xuhong Zhang</a>, <a href="/search/cs?searchtype=author&query=Han%2C+M">Meng Han</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+J">Jianwei Yin</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+T">Ting Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> Text-to-image diffusion models have emerged as powerful tools for generating high-quality images from textual descriptions. However, their increasing popularity has raised significant copyright concerns, as these models can be misused to reproduce copyrighted content without authorization. In response, recent studies have proposed various copyright protection methods, including adversarial perturbation, concept erasure, and watermarking techniques. However, their effectiveness and robustness against advanced attacks remain largely unexplored. Moreover, the lack of unified evaluation frameworks has hindered systematic comparison and fair assessment of different approaches. To bridge this gap, we systematize existing copyright protection methods and attacks, providing a unified taxonomy of their design spaces. We then develop CopyrightMeter, a unified evaluation framework that incorporates 17 state-of-the-art protections and 16 representative attacks. Leveraging CopyrightMeter, we comprehensively evaluate protection methods across multiple dimensions, thereby uncovering how different design choices impact fidelity, efficacy, and resilience under attacks. Our analysis reveals several key findings: (i) most protections (16/17) are not resilient against attacks; (ii) the "best" protection varies depending on the target priority; (iii) more advanced attacks significantly drive the upgrading of protections. These insights provide concrete guidance for developing more robust protection methods, while our unified evaluation protocol establishes a standard benchmark for future copyright protection research in text-to-image generation. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024.
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.11394">arXiv:2411.11394</a> <span> [<a href="https://arxiv.org/pdf/2411.11394">pdf</a>, <a href="https://arxiv.org/format/2411.11394">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> InstruGen: Automatic Instruction Generation for Vision-and-Language Navigation Via Large Multimodal Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yan%2C+Y">Yu Yan</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+R">Rongtao Xu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jiazhao Zhang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+P">Peiyang Li</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+X">Xiaodan Liang</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+J">Jianqin Yin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> Recent research on Vision-and-Language Navigation (VLN) indicates that agents suffer from poor generalization in unseen environments due to the lack of realistic training environments and high-quality path-instruction pairs. Most existing methods for constructing realistic navigation scenes have high costs, and the extension of instructions mainly relies on predefined templates or rules, lacking adaptability. To alleviate the issue, we propose InstruGen, a VLN path-instruction pairs generation paradigm. Specifically, we use YouTube house tour videos as realistic navigation scenes and leverage the powerful visual understanding and generation abilities of large multimodal models (LMMs) to automatically generate diverse and high-quality VLN path-instruction pairs. Our method generates navigation instructions with different granularities and achieves fine-grained alignment between instructions and visual observations, which was difficult to achieve with previous methods. Additionally, we design a multi-stage verification mechanism to reduce hallucinations and inconsistency of LMMs. Experimental results demonstrate that agents trained with path-instruction pairs generated by InstruGen achieve state-of-the-art performance on the R2R and RxR benchmarks, particularly in unseen environments. Code is available at https://github.com/yanyu0526/InstruGen. </span> </p>
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11394v1-abstract-full').style.display = 'none'; document.getElementById('2411.11394v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.07725">arXiv:2411.07725</a> <span> [<a href="https://arxiv.org/pdf/2411.07725">pdf</a>, <a href="https://arxiv.org/format/2411.07725">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> ALOcc: Adaptive Lifting-based 3D Semantic Occupancy and Cost Volume-based Flow Prediction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+D">Dubing Chen</a>, <a href="/search/cs?searchtype=author&query=Fang%2C+J">Jin Fang</a>, <a href="/search/cs?searchtype=author&query=Han%2C+W">Wencheng Han</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+X">Xinjing Cheng</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+J">Junbo Yin</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+C">Chenzhong Xu</a>, <a href="/search/cs?searchtype=author&query=Khan%2C+F+S">Fahad Shahbaz Khan</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+J">Jianbing Shen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.07725v1-abstract-short" style="display: inline;"> Vision-based semantic occupancy and flow prediction plays a crucial role in providing spatiotemporal cues for real-world tasks, such as autonomous driving. Existing methods prioritize higher accuracy to cater to the demands of these tasks. In this work, we strive to improve performance by introducing a series of targeted improvements for 3D semantic occupancy prediction and flow estimation. First,… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.07725v1-abstract-full').style.display = 'inline'; document.getElementById('2411.07725v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.07725v1-abstract-full" style="display: none;"> Vision-based semantic occupancy and flow prediction plays a crucial role in providing spatiotemporal cues for real-world tasks, such as autonomous driving. Existing methods prioritize higher accuracy to cater to the demands of these tasks. In this work, we strive to improve performance by introducing a series of targeted improvements for 3D semantic occupancy prediction and flow estimation. First, we introduce an occlusion-aware adaptive lifting mechanism with a depth denoising technique to improve the robustness of 2D-to-3D feature transformation and reduce the reliance on depth priors. Second, we strengthen the semantic consistency between 3D features and their original 2D modalities by utilizing shared semantic prototypes to jointly constrain both 2D and 3D features. 
This is complemented by confidence- and category-based sampling strategies to tackle long-tail challenges in 3D space. To alleviate the feature encoding burden in the joint prediction of semantics and flow, we propose a BEV cost volume-based prediction method that links flow and semantic features through a cost volume and employs a classification-regression supervision scheme to address the varying flow scales in dynamic scenes. Our purely convolutional framework, named ALOcc, achieves an optimal tradeoff between speed and accuracy, attaining state-of-the-art results on multiple benchmarks. On Occ3D, when training without the camera-visible mask, ALOcc achieves an absolute gain of 2.5% in terms of RayIoU while operating at a speed comparable to the state-of-the-art, using the same input size (256$\times$704) and ResNet-50 backbone. Our method also achieved 2nd place in the CVPR24 Occupancy and Flow Prediction Competition. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p>
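<p class="is-size-7">For orientation, the baseline that "adaptive lifting" builds on can be sketched in the lift-splat style: each pixel's 2D feature is spread along its ray, weighted by a predicted depth distribution. This is the generic mechanism only, with illustrative shapes, not ALOcc's occlusion-aware refinement.</p> <pre><code class="language-python">import torch

def lift_features(feat2d, depth_logits):
    """feat2d: (B, C, H, W); depth_logits: (B, D, H, W) over D depth bins."""
    depth_probs = depth_logits.softmax(dim=1)          # per-pixel depth dist.
    # Outer product over the depth axis: (B, C, 1, H, W) * (B, 1, D, H, W)
    return feat2d.unsqueeze(2) * depth_probs.unsqueeze(1)  # (B, C, D, H, W)

vox = lift_features(torch.randn(1, 32, 16, 44), torch.randn(1, 48, 16, 44))
print(vox.shape)        # torch.Size([1, 32, 48, 16, 44])
</code></pre>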
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.07688">arXiv:2411.07688</a> <span> [<a href="https://arxiv.org/pdf/2411.07688">pdf</a>, <a href="https://arxiv.org/format/2411.07688">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Enhancing Ultra High Resolution Remote Sensing Imagery Analysis with ImageRAG </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+Z">Zilun Zhang</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+H">Haozhan Shen</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+T">Tiancheng Zhao</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yuhao Wang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+B">Bin Chen</a>, <a href="/search/cs?searchtype=author&query=Cai%2C+Y">Yuxiang Cai</a>, <a href="/search/cs?searchtype=author&query=Shang%2C+Y">Yongheng Shang</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+J">Jianwei Yin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> Ultra High Resolution (UHR) remote sensing imagery (RSI) (e.g. 100,000 $\times$ 100,000 pixels or more) poses a significant challenge for current Remote Sensing Multimodal Large Language Models (RSMLLMs). Resizing a UHR image to the standard input size discards the extensive spatial and contextual information that UHR images contain, while keeping the original size often exceeds the token limits of standard RSMLLMs, making it difficult to process the entire image and capture the long-range dependencies needed to answer queries grounded in the abundant visual context. In this paper, we introduce ImageRAG for RS, a training-free framework to address the complexities of analyzing UHR remote sensing imagery. By recasting UHR remote sensing image analysis as a long-context selection task over the image, we design an innovative image contextual retrieval mechanism based on the Retrieval-Augmented Generation (RAG) technique, denoted ImageRAG. ImageRAG's core innovation lies in its ability to selectively retrieve and focus on the most relevant portions of the UHR image as visual contexts that pertain to a given query. A fast path and a slow path are proposed in this framework to handle the task efficiently and effectively. ImageRAG allows RSMLLMs to manage extensive context and spatial information from UHR RSI, ensuring the analysis is both accurate and efficient. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li>
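<p class="is-size-7">The retrieval idea above can be pictured as tiling the UHR image and retrieving only the query-relevant tiles as visual context. A toy sketch under stated assumptions (the <code>embed</code> function is a stand-in for a real vision/text encoder, and the tiling grid is arbitrary):</p>
<pre><code class="language-python">
import numpy as np

def embed(x):
    # Stand-in encoder: a deterministic unit vector per input (assumption).
    rng = np.random.default_rng(abs(hash(str(x))) % (2**32))
    v = rng.standard_normal(64)
    return v / np.linalg.norm(v)

def tile_image(image, tile=256):
    h, w = image.shape[:2]
    return [image[i:i+tile, j:j+tile]
            for i in range(0, h, tile) for j in range(0, w, tile)]

def retrieve_tiles(image, query, k=3):
    tiles = tile_image(image)
    q = embed(query)
    sims = [float(q @ embed(t.tobytes())) for t in tiles]
    order = np.argsort(sims)[::-1][:k]      # top-k most query-relevant tiles
    return [tiles[i] for i in order]

uhr = np.zeros((1024, 1024, 3), dtype=np.uint8)   # placeholder UHR image
ctx = retrieve_tiles(uhr, "where are the airplanes?")
print(len(ctx))                                   # 3 tiles passed to the RSMLLM
</code></pre>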
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.07510">arXiv:2411.07510</a> <span> [<a href="https://arxiv.org/pdf/2411.07510">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> </div> </div> <p class="title is-5 mathjax"> An Attack Traffic Identification Method Based on Temporal Spectrum </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xie%2C+W">Wenwei Xie</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+J">Jie Yin</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Z">Zihao Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> To address the issues of insufficient robustness, unstable features, and data noise interference in existing network attack detection and identification models, this paper proposes an attack traffic detection and identification method based on the temporal spectrum. First, traffic data is segmented by a sliding window to construct a feature sequence and a corresponding label sequence for the network traffic. Next, the proposed spectral label generation methods, SSPE and COAP, are applied to transform the label sequence into spectral labels and the feature sequence into temporal features. Spectral labels and temporal features are used to capture and represent the behavioral patterns of attacks. Finally, the constructed temporal features and spectral labels are used to train models, which subsequently detect and identify network attack behaviors. Experimental results demonstrate that, compared to traditional methods, models trained with the SSPE or COAP method improve identification accuracy by 10% and exhibit strong robustness, particularly in noisy environments. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">20 pages, 7 figures, 7 tables, 8 formulas</span> </p> </li>
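<p class="is-size-7">The first step the abstract describes, sliding-window segmentation of traffic into a feature sequence and a label sequence, can be sketched as follows (the window size, stride, and majority-vote labeling are illustrative choices, not the paper's exact settings):</p>
<pre><code class="language-python">
import numpy as np

def sliding_windows(flow_features, labels, win=16, stride=4):
    """Segment a traffic feature stream into windowed sequences with a
    per-window label (majority vote). Names and settings are illustrative."""
    X, y = [], []
    for start in range(0, len(flow_features) - win + 1, stride):
        X.append(flow_features[start:start + win])
        seg = labels[start:start + win]
        y.append(np.bincount(seg).argmax())   # dominant label in the window
    return np.stack(X), np.array(y)

feats = np.random.rand(200, 12)               # 200 packets/flows, 12 features
labs = np.random.randint(0, 3, size=200)      # 0 = benign, 1/2 = attack types
X, y = sliding_windows(feats, labs)
print(X.shape, y.shape)                       # (47, 16, 12) (47,)
</code></pre>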
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">20 pages, 7 figures, 7 tables, 8 formulas</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.06881">arXiv:2411.06881</a> <span> [<a href="https://arxiv.org/pdf/2411.06881">pdf</a>, <a href="https://arxiv.org/format/2411.06881">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> WassFFed: Wasserstein Fair Federated Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Han%2C+Z">Zhongxuan Han</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+L">Li Zhang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+C">Chaochao Chen</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+X">Xiaolin Zheng</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+F">Fei Zheng</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yuyuan Li</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+J">Jianwei Yin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.06881v1-abstract-short" style="display: inline;"> Federated Learning (FL) employs a training approach to address scenarios where users' data cannot be shared across clients. Achieving fairness in FL is imperative since training data in FL is inherently geographically distributed among diverse user groups. Existing research on fairness predominantly assumes access to the entire training data, making direct transfer to FL challenging. However, the… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.06881v1-abstract-full').style.display = 'inline'; document.getElementById('2411.06881v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.06881v1-abstract-full" style="display: none;"> Federated Learning (FL) employs a training approach to address scenarios where users' data cannot be shared across clients. Achieving fairness in FL is imperative since training data in FL is inherently geographically distributed among diverse user groups. Existing research on fairness predominantly assumes access to the entire training data, making direct transfer to FL challenging. However, the limited existing research on fairness in FL does not effectively address two key challenges, i.e., (CH1) Current methods fail to deal with the inconsistency between fair optimization results obtained with surrogate functions and fair classification results. (CH2) Directly aggregating local fair models does not always yield a globally fair model due to non Identical and Independent data Distributions (non-IID) among clients. To address these challenges, we propose a Wasserstein Fair Federated Learning framework, namely WassFFed. To tackle CH1, we ensure that the outputs of local models, rather than the loss calculated with surrogate functions or classification results with a threshold, remain independent of various user groups. 
To resolve CH2, we employ a Wasserstein barycenter calculation of all local models' outputs for each user group, bringing local model outputs closer to the global output distribution to ensure consistency between the global model and local models. We conduct extensive experiments on three real-world datasets, demonstrating that WassFFed outperforms existing approaches in striking a balance between accuracy and fairness. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.06881v1-abstract-full').style.display = 'none'; document.getElementById('2411.06881v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Submitted to TKDE</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.22832">arXiv:2410.22832</a> <span> [<a href="https://arxiv.org/pdf/2410.22832">pdf</a>, <a href="https://arxiv.org/format/2410.22832">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> </div> </div> <p class="title is-5 mathjax"> HijackRAG: Hijacking Attacks against Retrieval-Augmented Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yucheng Zhang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Q">Qinfeng Li</a>, <a href="/search/cs?searchtype=author&query=Du%2C+T">Tianyu Du</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xuhong Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+X">Xinkui Zhao</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+Z">Zhengwen Feng</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+J">Jianwei Yin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.22832v1-abstract-short" style="display: inline;"> Retrieval-Augmented Generation (RAG) systems enhance large language models (LLMs) by integrating external knowledge, making them adaptable and cost-effective for various applications. However, the growing reliance on these systems also introduces potential security risks. 
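<p class="is-size-7">For one-dimensional output distributions, the Wasserstein barycenter used to aggregate local models has a simple closed form: average the clients' quantile functions. A minimal sketch of that aggregation idea (uniform client weights assumed; not the full WassFFed algorithm):</p>
<pre><code class="language-python">
import numpy as np

def wasserstein_barycenter_1d(samples_per_client, n_quantiles=100):
    """1-D Wasserstein barycenter of empirical distributions: average the
    clients' quantile functions (uniform weights). Aggregation idea only."""
    qs = np.linspace(0, 1, n_quantiles)
    quantiles = np.stack([np.quantile(s, qs) for s in samples_per_client])
    return quantiles.mean(axis=0)            # barycenter's quantile function

# Model scores for one user group on three clients (illustrative data).
clients = [np.random.beta(2, 5, 500),
           np.random.beta(5, 2, 500),
           np.random.beta(3, 3, 500)]
bary_q = wasserstein_barycenter_1d(clients)
print(bary_q[:5])                            # low quantiles of the barycenter
</code></pre>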
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.22832">arXiv:2410.22832</a> <span> [<a href="https://arxiv.org/pdf/2410.22832">pdf</a>, <a href="https://arxiv.org/format/2410.22832">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> </div> </div> <p class="title is-5 mathjax"> HijackRAG: Hijacking Attacks against Retrieval-Augmented Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yucheng Zhang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Q">Qinfeng Li</a>, <a href="/search/cs?searchtype=author&query=Du%2C+T">Tianyu Du</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xuhong Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+X">Xinkui Zhao</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+Z">Zhengwen Feng</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+J">Jianwei Yin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> Retrieval-Augmented Generation (RAG) systems enhance large language models (LLMs) by integrating external knowledge, making them adaptable and cost-effective for various applications. However, the growing reliance on these systems also introduces potential security risks. In this work, we reveal a novel vulnerability, the retrieval prompt hijack attack (HijackRAG), which enables attackers to manipulate the retrieval mechanisms of RAG systems by injecting malicious texts into the knowledge database. When the RAG system encounters target questions, it generates the attacker's predetermined answers instead of the correct ones, undermining the integrity and trustworthiness of the system. We formalize HijackRAG as an optimization problem and propose both black-box and white-box attack strategies tailored to different levels of the attacker's knowledge. Extensive experiments on multiple benchmark datasets show that HijackRAG consistently achieves high attack success rates, outperforming existing baseline attacks. Furthermore, we demonstrate that the attack is transferable across different retriever models, underscoring the widespread risk it poses to RAG systems. Lastly, our exploration of various defense mechanisms reveals that they are insufficient to counter HijackRAG, emphasizing the urgent need for more robust security measures to protect RAG systems in real-world deployments. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li>
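<p class="is-size-7">The attack's core observation is that a passage optimized for similarity to a target question wins top-k retrieval. A toy demonstration of that retrieval dynamic only (the <code>embed</code> function is a stand-in retriever, and the injected passage's "optimized" embedding is faked by reusing the question's own vector):</p>
<pre><code class="language-python">
import numpy as np

def embed(text):
    # Stand-in retriever encoder: deterministic unit vector per text (assumption).
    rng = np.random.default_rng(abs(hash(text)) % (2**32))
    v = rng.standard_normal(32)
    return v / np.linalg.norm(v)

corpus = ["benign doc on topic A", "benign doc on topic B"]
question = "target question?"

# An attacker-optimized passage maximizes similarity to the target question;
# here we fake that optimum by assigning it the question's own embedding.
injected = question + " [malicious instructions]"
docs = corpus + [injected]
vecs = [embed(d) for d in corpus] + [embed(question)]
sims = [float(embed(question) @ v) for v in vecs]
print(docs[int(np.argmax(sims))])   # the injected passage wins top-1 retrieval
</code></pre>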
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.16208">arXiv:2410.16208</a> <span> [<a href="https://arxiv.org/pdf/2410.16208">pdf</a>, <a href="https://arxiv.org/format/2410.16208">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Compute-Constrained Data Selection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yin%2C+J+O">Junjie Oscar Yin</a>, <a href="/search/cs?searchtype=author&query=Rush%2C+A+M">Alexander M. Rush</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> Data selection can reduce the amount of training data needed to finetune LLMs; however, the efficacy of data selection scales directly with its compute. Motivated by the practical challenge of compute-constrained finetuning, we consider the setting in which both the cost of selecting data and the cost of training are budgeted for. We first formalize the problem of data selection with a cost-aware utility function, and model the data selection problem as trading off initial selection cost for training gain. We run a comprehensive sweep of experiments across multiple tasks, varying compute budget by scaling finetuning tokens, model sizes, and data selection compute. Interestingly, we find that many powerful data selection methods are almost never compute-optimal, and that cheaper data selection alternatives dominate from both a theoretical and an empirical perspective. For compute-optimal training, we find that perplexity and gradient data selection require training-to-selection model size ratios of 5x and 10x, respectively. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 21 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li>
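<p class="is-size-7">The cost-aware utility the abstract formalizes can be illustrated with a toy budget split: selection compute eats into training compute, so an expensive selector must buy enough per-token gain to beat random selection. All numbers below are illustrative assumptions, not the paper's:</p>
<pre><code class="language-python">
# Toy cost-aware utility: pick the selection method that maximizes downstream
# gain subject to a joint (selection + training) compute budget.
BUDGET = 1.0e18                                   # total FLOPs available

methods = {                                       # (selection cost, gain/token)
    "random":     (0.0,    1.00),
    "perplexity": (2.0e17, 1.08),
    "gradient":   (5.0e17, 1.12),
}
FLOPS_PER_TOKEN = 1.0e9                           # training cost per token

def utility(select_cost, gain):
    train_budget = BUDGET - select_cost           # what is left for finetuning
    if train_budget <= 0:
        return float("-inf")
    tokens = train_budget / FLOPS_PER_TOKEN
    return gain * tokens                          # crude proxy for final quality

best = max(methods, key=lambda m: utility(*methods[m]))
print(best, {m: f"{utility(*c):.3e}" for m, c in methods.items()})
</code></pre>
<p class="is-size-7">With these toy numbers the cheap selector wins, mirroring the abstract's finding that costly selection methods are rarely compute-optimal.</p>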
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.13903">arXiv:2410.13903</a> <span> [<a href="https://arxiv.org/pdf/2410.13903">pdf</a>, <a href="https://arxiv.org/format/2410.13903">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> </div> </div> <p class="title is-5 mathjax"> CoreGuard: Safeguarding Foundational Capabilities of LLMs Against Model Stealing in Edge Deployment </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+Q">Qinfeng Li</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+Y">Yangfan Xie</a>, <a href="/search/cs?searchtype=author&query=Du%2C+T">Tianyu Du</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+Z">Zhiqiang Shen</a>, <a href="/search/cs?searchtype=author&query=Qin%2C+Z">Zhenghan Qin</a>, <a href="/search/cs?searchtype=author&query=Peng%2C+H">Hao Peng</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+X">Xinkui Zhao</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+X">Xianwei Zhu</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+J">Jianwei Yin</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xuhong Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> Proprietary large language models (LLMs) demonstrate exceptional generalization ability across various tasks. Additionally, deploying LLMs on edge devices is trending for efficiency and privacy reasons. However, edge deployment of proprietary LLMs introduces new security threats: attackers who obtain an edge-deployed LLM can easily use it as a base model for various tasks due to its high generalization ability, which we call foundational capability stealing. Unfortunately, existing model protection mechanisms are often task-specific and fail to protect general-purpose LLMs, as they mainly focus on protecting task-related parameters using trusted execution environments (TEEs). Although some recent TEE-based methods are able to protect the overall model parameters in a computation-efficient way, they still suffer from prohibitive communication costs between the TEE and the CPU/GPU, making them impractical to deploy for edge LLMs. To protect the foundational capabilities of edge LLMs, we propose CoreGuard, a computation- and communication-efficient model protection approach against model stealing on edge devices. The core component of CoreGuard is a lightweight and propagative authorization module residing in the TEE. Extensive experiments show that CoreGuard achieves the same protection as black-box security guarantees with negligible overhead. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li>
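<p class="is-size-7">One way to picture a TEE-resident authorization module gating a model is to ship scrambled weights and keep the unscrambling transform inside the TEE. The sketch below is a toy stand-in for that general idea only; it is not CoreGuard's actual mechanism:</p>
<pre><code class="language-python">
import numpy as np

rng = np.random.default_rng(0)
W = rng.standard_normal((8, 8))          # protected layer weights
P = np.eye(8)[rng.permutation(8)]        # secret permutation, held in the TEE

W_locked = W @ P                          # scrambled weights shipped on-device

def tee_authorize(x):
    return P.T @ x                        # TEE unscrambles incoming activations

x = rng.standard_normal(8)
print(np.allclose(W_locked @ tee_authorize(x), W @ x))   # True: authorized path
print(np.allclose(W_locked @ x, W @ x))                  # False: stolen copy fails
</code></pre>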
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.11157">arXiv:2410.11157</a> <span> [<a href="https://arxiv.org/pdf/2410.11157">pdf</a>, <a href="https://arxiv.org/format/2410.11157">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Optimization and Control">math.OC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> RPCBF: Constructing Safety Filters Robust to Model Error and Disturbances via Policy Control Barrier Functions </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Knoedler%2C+L">Luzia Knoedler</a>, <a href="/search/cs?searchtype=author&query=So%2C+O">Oswin So</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+J">Ji Yin</a>, <a href="/search/cs?searchtype=author&query=Black%2C+M">Mitchell Black</a>, <a href="/search/cs?searchtype=author&query=Serlin%2C+Z">Zachary Serlin</a>, <a href="/search/cs?searchtype=author&query=Tsiotras%2C+P">Panagiotis Tsiotras</a>, <a href="/search/cs?searchtype=author&query=Alonso-Mora%2C+J">Javier Alonso-Mora</a>, <a href="/search/cs?searchtype=author&query=Fan%2C+C">Chuchu Fan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> Control Barrier Functions (CBFs) have proven to be an effective tool for performing safe control synthesis for nonlinear systems. However, guaranteeing safety in the presence of disturbances and input constraints for high relative degree systems is a difficult problem. In this work, we propose the Robust Policy CBF (RPCBF), a practical method of constructing CBF approximations that is easy to implement and robust to disturbances via the estimation of a value function. We demonstrate the effectiveness of our method in simulation on a variety of high relative degree input-constrained systems. Finally, we demonstrate the benefits of RPCBF in compensating for model errors on a hardware quadcopter platform by treating the model errors as disturbances. The project page can be found at https://oswinso.xyz/rpcbf. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 14 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Submitted to ICRA 2025. The project page can be found at https://oswinso.xyz/rpcbf</span> </p> </li>
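<p class="is-size-7">For context, a CBF is typically deployed inside a quadratic-program safety filter that minimally corrects the nominal control. A generic closed-form sketch of that filtering step for a single constraint (this is the standard machinery a learned CBF such as RPCBF plugs into, not the paper's construction; the dynamics and barrier below are toy assumptions):</p>
<pre><code class="language-python">
import numpy as np

def cbf_filter(u_nom, grad_h, f, g, h, alpha=1.0):
    """Closed-form single-constraint CBF-QP: minimally modify u_nom so that
    dh/dt = grad_h . (f + g u) >= -alpha * h along the dynamics."""
    a = grad_h @ g                        # constraint direction in control space
    b = grad_h @ f + alpha * h
    slack = a @ u_nom + b
    if slack >= 0:
        return u_nom                      # nominal control is already safe
    return u_nom - slack * a / (a @ a)    # minimal-norm safe correction

# Double integrator with a velocity-aware barrier h = 1 - x0 - x1 (toy).
x = np.array([0.9, 1.0])
f = np.array([x[1], 0.0])                 # drift: x0' = x1, x1' = u
g = np.array([[0.0], [1.0]])
print(cbf_filter(np.array([0.5]), np.array([-1.0, -1.0]), f, g, 1 - x[0] - x[1]))
</code></pre>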
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.09772">arXiv:2410.09772</a> <span> [<a href="https://arxiv.org/pdf/2410.09772">pdf</a>, <a href="https://arxiv.org/format/2410.09772">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> HypomimiaCoach: An AU-based Digital Therapy System for Hypomimia Detection & Rehabilitation with Parkinson's Disease </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xu%2C+Y">Yingjing Xu</a>, <a href="/search/cs?searchtype=author&query=Cai%2C+X">Xueyan Cai</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+Z">Zihong Zhou</a>, <a href="/search/cs?searchtype=author&query=Xue%2C+M">Mengru Xue</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+B">Bo Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+H">Haotian Wang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zhengke Li</a>, <a href="/search/cs?searchtype=author&query=Weng%2C+C">Chentian Weng</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+W">Wei Luo</a>, <a href="/search/cs?searchtype=author&query=Yao%2C+C">Cheng Yao</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+B">Bo Lin</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+J">Jianwei Yin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> Hypomimia is a non-motor symptom of Parkinson's disease that manifests as delayed facial movements and expressions, along with challenges in articulation and emotion. Currently, subjective evaluation by neurologists is the primary method for hypomimia detection, and conventional rehabilitation approaches heavily rely on verbal prompts from rehabilitation physicians. There remains a deficiency in accessible, user-friendly and scientifically rigorous assistive tools for hypomimia treatments. To investigate this, we developed HypomimiaCoach, an Action Unit (AU)-based digital therapy system for hypomimia detection and rehabilitation in Parkinson's disease. The HypomimiaCoach system was designed to facilitate engagement through the incorporation of both relaxed and controlled rehabilitation exercises, while also stimulating initiative through the integration of digital therapies that incorporated traditional face training methods. We extract Action Unit (AU) features and their relationships for hypomimia detection. In order to facilitate rehabilitation, a series of training programmes have been devised based on the AUs, and patients are provided with real-time feedback through an additional AU recognition model, which guides them through their training routines. A pilot study was conducted with seven participants in China, all of whom exhibited symptoms of Parkinson's disease hypomimia. The results of the pilot study demonstrated a positive impact on participants' self-efficacy, with favourable feedback received. Furthermore, physician evaluations validated the system's applicability in a therapeutic setting for patients with Parkinson's disease, as well as its potential value in clinical applications. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li>
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.09508">arXiv:2410.09508</a> <span> [<a href="https://arxiv.org/pdf/2410.09508">pdf</a>, <a href="https://arxiv.org/format/2410.09508">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computers and Society">cs.CY</span> </div> </div> <p class="title is-5 mathjax"> CollabEdit: Towards Non-destructive Collaborative Knowledge Editing </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zheng%2C+J">Jiamu Zheng</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jinghuai Zhang</a>, <a href="/search/cs?searchtype=author&query=Du%2C+T">Tianyu Du</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xuhong Zhang</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+J">Jianwei Yin</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+T">Tao Lin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> Collaborative learning of large language models (LLMs) has emerged as a new paradigm for utilizing private data from different parties to guarantee efficiency and privacy. Meanwhile, Knowledge Editing (KE) for LLMs has also garnered increased attention due to its ability to manipulate the behaviors of LLMs explicitly, yet it leaves the collaborative KE case (in which knowledge edits of multiple parties are aggregated in a privacy-preserving and continual manner) unexamined. To this end, this manuscript dives into the first investigation of collaborative KE, in which we start by carefully identifying the three unique challenges therein: knowledge overlap, knowledge conflict, and knowledge forgetting. We then propose a non-destructive collaborative KE framework, COLLABEDIT, which employs a novel model merging mechanism to mimic global KE behavior while preventing severe performance drops. Extensive experiments on two canonical datasets demonstrate the superiority of COLLABEDIT compared to other destructive baselines, and the results shed light on addressing the three collaborative KE challenges and future applications. Our code is available at https://github.com/LINs-lab/CollabEdit. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 12 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">20 pages, 11 figures. Published as a conference paper at ICLR 2025. Code at https://github.com/LINs-lab/CollabEdit</span> </p> </li>
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.09132">arXiv:2410.09132</a> <span> [<a href="https://arxiv.org/pdf/2410.09132">pdf</a>, <a href="https://arxiv.org/format/2410.09132">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> When Graph meets Multimodal: Benchmarking on Multimodal Attributed Graphs Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yan%2C+H">Hao Yan</a>, <a href="/search/cs?searchtype=author&query=Li%2C+C">Chaozhuo Li</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+Z">Zhigang Yu</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+J">Jun Yin</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+R">Ruochen Liu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+P">Peiyan Zhang</a>, <a href="/search/cs?searchtype=author&query=Han%2C+W">Weihao Han</a>, <a href="/search/cs?searchtype=author&query=Li%2C+M">Mingzheng Li</a>, <a href="/search/cs?searchtype=author&query=Zeng%2C+Z">Zhengxin Zeng</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+H">Hao Sun</a>, <a href="/search/cs?searchtype=author&query=Deng%2C+W">Weiwei Deng</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+F">Feng Sun</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Q">Qi Zhang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+S">Senzhang Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> Multimodal attributed graphs (MAGs) are prevalent in various real-world scenarios and generally contain two kinds of knowledge: (a) attribute knowledge, supported mainly by the attributes of different modalities contained in the nodes (entities) themselves, such as texts and images; and (b) topology knowledge, provided by the complex interactions between nodes. The cornerstone of MAG representation learning lies in the seamless integration of multimodal attributes and topology. Recent advancements in Pre-trained Language/Vision models (PLMs/PVMs) and Graph Neural Networks (GNNs) have facilitated effective learning on MAGs, garnering increased research interest. However, the absence of meaningful benchmark datasets and standardized evaluation procedures for MAG representation learning has impeded progress in this field. In this paper, we propose the Multimodal Attribute Graph Benchmark (MAGB), a comprehensive and diverse collection of challenging benchmark datasets for MAGs. The MAGB datasets are notably large in scale and encompass a wide range of domains, spanning from e-commerce networks to social networks. In addition to the brand-new datasets, we conduct extensive benchmark experiments over MAGB with various learning paradigms, ranging from GNN-based to PLM-based methods, to explore the necessity and feasibility of integrating multimodal attributes and graph topology. In a nutshell, we provide an overview of the MAGB datasets and standardized evaluation procedures, and present baseline experiments. The entire MAGB project is publicly accessible at https://github.com/sktsherlock/ATG. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li>
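<p class="is-size-7">The integration of multimodal attributes and topology that MAGB benchmarks can be pictured with one mean-aggregation GNN layer over concatenated text and image embeddings (a minimal illustration, not any specific baseline from the benchmark):</p>
<pre><code class="language-python">
import numpy as np

def gnn_layer(A, X, W):
    """One mean-aggregation GNN layer: fuse neighborhood (topology) with node
    attributes. X concatenates per-node text and image embeddings."""
    deg = A.sum(1, keepdims=True).clip(min=1)
    return np.maximum((A @ X) / deg @ W, 0)       # mean-aggregate, project, ReLU

n, dt, di, dh = 5, 8, 8, 16
text_emb = np.random.rand(n, dt)                  # e.g. from a pre-trained LM
img_emb = np.random.rand(n, di)                   # e.g. from a pre-trained ViT
X = np.concatenate([text_emb, img_emb], axis=1)   # multimodal node attributes
A = (np.random.rand(n, n) > 0.5).astype(float)
A = np.maximum(A, A.T) + np.eye(n)                # undirected + self-loops
H = gnn_layer(A, X, np.random.rand(dt + di, dh))
print(H.shape)                                    # (5, 16)
</code></pre>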
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.01671">arXiv:2410.01671</a> <span> [<a href="https://arxiv.org/pdf/2410.01671">pdf</a>, <a href="https://arxiv.org/format/2410.01671">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Bridging Context Gaps: Leveraging Coreference Resolution for Long Contextual Understanding </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yanming Liu</a>, <a href="/search/cs?searchtype=author&query=Peng%2C+X">Xinyue Peng</a>, <a href="/search/cs?searchtype=author&query=Cao%2C+J">Jiannan Cao</a>, <a href="/search/cs?searchtype=author&query=Bo%2C+S">Shi Bo</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+Y">Yanxin Shen</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xuhong Zhang</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+S">Sheng Cheng</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xun Wang</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+J">Jianwei Yin</a>, <a href="/search/cs?searchtype=author&query=Du%2C+T">Tianyu Du</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> Large language models (LLMs) have shown remarkable capabilities in natural language processing; however, they still face difficulties when tasked with understanding lengthy contexts and executing effective question answering. These challenges often arise from the complexity and ambiguity present in longer texts. To enhance the performance of LLMs in such scenarios, we introduce the Long Question Coreference Adaptation (LQCA) method. This framework focuses on coreference resolution tailored to long contexts, allowing the model to identify and manage references effectively. The LQCA method encompasses four key steps: resolving coreferences within sub-documents, computing the distances between mentions, defining a representative mention for each coreference cluster, and answering questions through mention replacement. By processing information systematically, the framework provides easier-to-handle partitions for LLMs, promoting better understanding. Experimental evaluations on a range of LLMs and datasets have yielded positive results, with notable improvements on the OpenAI o1-mini and GPT-4o models, highlighting the effectiveness of leveraging coreference resolution to bridge context gaps in question answering. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Under-review version of LQCA; bridging the context gap for long contexts</span> </p> </li>
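<p class="is-size-7">The last of the four LQCA steps, answering via mention replacement, can be sketched on a toy example: once coreference clusters and a representative mention are known, every mention is rewritten explicitly. The clusters below are hand-made; a real pipeline would compute them per sub-document, merge them by mention distance, and handle possessives more carefully:</p>
<pre><code class="language-python">
import re

clusters = {"the CEO": ["she", "her"]}     # representative mention -> mentions

def replace_mentions(text, clusters):
    # Rewrite every mention as its cluster's representative mention so that
    # references resolve explicitly for the downstream QA model.
    for rep, mentions in clusters.items():
        for m in mentions:
            text = re.sub(rf"\b{re.escape(m)}\b", rep, text)
    return text

doc = "The CEO announced a deal. Later she said her team agreed."
print(replace_mentions(doc, clusters))
# -> "The CEO announced a deal. Later the CEO said the CEO team agreed."
</code></pre>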
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.01488">arXiv:2410.01488</a> <span> [<a href="https://arxiv.org/pdf/2410.01488">pdf</a>, <a href="https://arxiv.org/format/2410.01488">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Programming Languages">cs.PL</span> </div> </div> <p class="title is-5 mathjax"> SecCoder: Towards Generalizable and Robust Secure Code Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+B">Boyu Zhang</a>, <a href="/search/cs?searchtype=author&query=Du%2C+T">Tianyu Du</a>, <a href="/search/cs?searchtype=author&query=Tong%2C+J">Junkai Tong</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xuhong Zhang</a>, <a href="/search/cs?searchtype=author&query=Chow%2C+K">Kingsum Chow</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+S">Sheng Cheng</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xun Wang</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+J">Jianwei Yin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> As large models (LMs) have gained widespread acceptance in code-related tasks, their superior generative capacity has greatly promoted the application of code LMs. Nevertheless, the security of the generated code has drawn attention for its potential to cause damage. Existing secure code generation methods have limited generalizability to unseen test cases and poor robustness against attacked models, leading to safety failures in code generation. In this paper, we propose SecCoder, a generalizable and robust secure code generation method that uses in-context learning (ICL) with safe demonstrations. A dense retriever is used to select the most helpful demonstration, maximizing the improvement in the generated code's security. Experimental results show the superior generalizability of SecCoder compared to the current secure code generation method, achieving a significant average security improvement of 7.20% on unseen test cases. The results also show the better robustness of SecCoder compared to the currently attacked code LM, achieving a significant average security improvement of 7.74%. Our analysis indicates that SecCoder enhances the security of LMs in generating code, and that it is more generalizable and robust. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">To appear in the 2024 Conference on Empirical Methods in Natural Language Processing (EMNLP)</span> </p> </li>
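<p class="is-size-7">The retrieval-augmented ICL step the abstract describes can be sketched as: densely retrieve the most relevant safe demonstration and prepend it to the generation prompt. The <code>embed</code> function and the demonstration pool below are assumptions for illustration, not SecCoder's artifacts:</p>
<pre><code class="language-python">
import numpy as np

def embed(text):
    # Stand-in dense retriever: deterministic unit vector per text (assumption).
    rng = np.random.default_rng(abs(hash(text)) % (2**32))
    v = rng.standard_normal(48)
    return v / np.linalg.norm(v)

safe_demos = [                           # (task, secure reference guidance)
    ("read a file path from the user", "use pathlib plus allow-list validation"),
    ("run a shell command", "use subprocess.run with a list argv, shell=False"),
    ("build an SQL query", "use parameterized queries, never string formatting"),
]

def build_prompt(task):
    q = embed(task)
    scores = [q @ embed(t) for t, _ in safe_demos]
    t, sol = safe_demos[int(np.argmax(scores))]   # most relevant safe demo
    return f"# Secure example for: {t}\n# {sol}\n\n# Now solve: {task}\n"

print(build_prompt("execute a user-supplied shell command"))
</code></pre>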
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.20154">arXiv:2409.20154</a> <span> [<a href="https://arxiv.org/pdf/2409.20154">pdf</a>, <a href="https://arxiv.org/format/2409.20154">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> GravMAD: Grounded Spatial Value Maps Guided Action Diffusion for Generalized 3D Manipulation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yangtao Chen</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Z">Zixuan Chen</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+J">Junhui Yin</a>, <a href="/search/cs?searchtype=author&query=Huo%2C+J">Jing Huo</a>, <a href="/search/cs?searchtype=author&query=Tian%2C+P">Pinzhuo Tian</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+J">Jieqi Shi</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+Y">Yang Gao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax"> Robots' ability to follow language instructions and execute diverse 3D manipulation tasks is vital in robot learning. Traditional imitation learning-based methods perform well on seen tasks but struggle with novel, unseen ones due to variability. Recent approaches leverage large foundation models to assist in understanding novel tasks, thereby mitigating this issue. However, these methods lack a task-specific learning process, which is essential for an accurate understanding of 3D environments, often leading to execution failures. In this paper, we introduce GravMAD, a sub-goal-driven, language-conditioned action diffusion framework that combines the strengths of imitation learning and foundation models. Our approach breaks tasks into sub-goals based on language instructions, allowing auxiliary guidance during both training and inference. During training, we introduce Sub-goal Keypose Discovery to identify key sub-goals from demonstrations. Inference differs from training, as there are no demonstrations available, so we use pre-trained foundation models to bridge the gap and identify sub-goals for the current task. In both phases, GravMaps are generated from sub-goals, providing GravMAD with more flexible 3D spatial guidance compared to fixed 3D positions. Empirical evaluations on RLBench show that GravMAD significantly outperforms state-of-the-art methods, with a 28.63% improvement on novel tasks and a 13.36% gain on tasks encountered during training. Evaluations on real-world robotic tasks further show that GravMAD can reason about real-world tasks, associate them with relevant visual information, and generalize to novel tasks. These results demonstrate GravMAD's strong multi-task learning and generalization in 3D manipulation. Video demonstrations are available at: https://gravmad.github.io. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 30 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ICLR 2025. The first two authors contributed equally</span> </p> </li>
Unlike prior challenges that emphasized scaling up classical ANN search ~\cite{DBLP:conf/nips/SimhadriWADBBCH21}, this competi… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.17424v1-abstract-full').style.display = 'inline'; document.getElementById('2409.17424v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.17424v1-abstract-full" style="display: none;"> The 2023 Big ANN Challenge, held at NeurIPS 2023, focused on advancing the state-of-the-art in indexing data structures and search algorithms for practical variants of Approximate Nearest Neighbor (ANN) search that reflect the growing complexity and diversity of workloads. Unlike prior challenges that emphasized scaling up classical ANN search ~\cite{DBLP:conf/nips/SimhadriWADBBCH21}, this competition addressed filtered search, out-of-distribution data, sparse and streaming variants of ANNS. Participants developed and submitted innovative solutions that were evaluated on new standard datasets with constrained computational resources. The results showcased significant improvements in search accuracy and efficiency over industry-standard baselines, with notable contributions from both academic and industrial teams. This paper summarizes the competition tracks, datasets, evaluation metrics, and the innovative approaches of the top-performing submissions, providing insights into the current advancements and future directions in the field of approximate nearest neighbor search. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.17424v1-abstract-full').style.display = 'none'; document.getElementById('2409.17424v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Code: https://github.com/harsha-simhadri/big-ann-benchmarks/releases/tag/v0.3.0</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> H.3.3 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.12139">arXiv:2409.12139</a> <span> [<a href="https://arxiv.org/pdf/2409.12139">pdf</a>, <a href="https://arxiv.org/format/2409.12139">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Takin: A Cohort of Superior Quality Zero-shot Speech Generation Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+S">Sijing Chen</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+Y">Yuan Feng</a>, <a href="/search/cs?searchtype=author&query=He%2C+L">Laipeng He</a>, <a href="/search/cs?searchtype=author&query=He%2C+T">Tianwei He</a>, <a href="/search/cs?searchtype=author&query=He%2C+W">Wendi He</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+Y">Yanni Hu</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+B">Bin Lin</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+Y">Yiting Lin</a>, <a href="/search/cs?searchtype=author&query=Pan%2C+Y">Yu Pan</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+P">Pengfei Tan</a>, <a href="/search/cs?searchtype=author&query=Tian%2C+C">Chengwei Tian</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+C">Chen Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zhicheng Wang</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+R">Ruoye Xie</a>, <a href="/search/cs?searchtype=author&query=Yao%2C+J">Jixun Yao</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+Q">Quanlei Yan</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Y">Yuguang Yang</a>, <a href="/search/cs?searchtype=author&query=Ye%2C+J">Jianhao Ye</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+J">Jingjing Yin</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+Y">Yanzhen Yu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+H">Huimin Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xiang Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+G">Guangcheng Zhao</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+H">Hongbin Zhou</a>, <a href="/search/cs?searchtype=author&query=Zou%2C+P">Pengpeng Zou</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.12139v3-abstract-short" style="display: inline;"> With the advent of the big data and large language model era, zero-shot personalized rapid customization has emerged as a significant trend. In this report, we introduce Takin AudioLLM, a series of techniques and models, mainly including Takin TTS, Takin VC, and Takin Morphing, specifically designed for audiobook production. 
These models are capable of zero-shot speech production, generating high-… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.12139v3-abstract-full').style.display = 'inline'; document.getElementById('2409.12139v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.12139v3-abstract-full" style="display: none;"> With the advent of the big data and large language model era, zero-shot personalized rapid customization has emerged as a significant trend. In this report, we introduce Takin AudioLLM, a series of techniques and models, mainly including Takin TTS, Takin VC, and Takin Morphing, specifically designed for audiobook production. These models are capable of zero-shot speech production, generating high-quality speech that is nearly indistinguishable from real human speech and facilitating individuals to customize the speech content according to their own needs. Specifically, we first introduce Takin TTS, a neural codec language model that builds upon an enhanced neural speech codec and a multi-task training framework, capable of generating high-fidelity natural speech in a zero-shot way. For Takin VC, we advocate an effective content and timbre joint modeling approach to improve the speaker similarity, while advocating for a conditional flow matching based decoder to further enhance its naturalness and expressiveness. Last, we propose the Takin Morphing system with highly decoupled and advanced timbre and prosody modeling approaches, which enables individuals to customize speech production with their preferred timbre and prosody in a precise and controllable manner. Extensive experiments validate the effectiveness and robustness of our Takin AudioLLM series models. For detailed demos, please refer to https://everest-ai.github.io/takinaudiollm/. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.12139v3-abstract-full').style.display = 'none'; document.getElementById('2409.12139v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 18 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. 
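The Takin VC decoder is described as conditional flow matching based. As a rough, generic sketch of that training objective (standard conditional flow matching with straight interpolation paths, not the paper's actual model or code; the network and dimensions below are assumptions), one regresses a velocity field between noise and data:

```python
import torch
import torch.nn as nn

class VelocityNet(nn.Module):
    """Toy stand-in for the decoder; predicts velocity from (x_t, t, cond)."""
    def __init__(self, dim=80, cond_dim=256):
        super().__init__()
        self.net = nn.Sequential(nn.Linear(dim + 1 + cond_dim, 512), nn.SiLU(),
                                 nn.Linear(512, dim))

    def forward(self, x_t, t, cond):
        return self.net(torch.cat([x_t, t, cond], dim=-1))

def cfm_loss(model, x1, cond):
    """Flow-matching loss along linear paths x_t = (1 - t) x0 + t x1."""
    x0 = torch.randn_like(x1)          # noise endpoint
    t = torch.rand(x1.size(0), 1)      # uniform time
    x_t = (1 - t) * x0 + t * x1
    target_v = x1 - x0                 # velocity of the straight path
    return ((model(x_t, t, cond) - target_v) ** 2).mean()

model = VelocityNet()
loss = cfm_loss(model, torch.randn(4, 80), torch.randn(4, 256))
loss.backward()
```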
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Technical Report; 18 pages; typos corrected, references added, demo url modified, author name modified;</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.10071">arXiv:2409.10071</a> <span> [<a href="https://arxiv.org/pdf/2409.10071">pdf</a>, <a href="https://arxiv.org/format/2409.10071">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> Towards Physically-Realizable Adversarial Attacks in Embodied Vision Navigation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+M">Meng Chen</a>, <a href="/search/cs?searchtype=author&query=Tu%2C+J">Jiawei Tu</a>, <a href="/search/cs?searchtype=author&query=Qi%2C+C">Chao Qi</a>, <a href="/search/cs?searchtype=author&query=Dang%2C+Y">Yonghao Dang</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+F">Feng Zhou</a>, <a href="/search/cs?searchtype=author&query=Wei%2C+W">Wei Wei</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+J">Jianqin Yin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.10071v3-abstract-short" style="display: inline;"> The deployment of embodied navigation agents in safety-critical environments raises concerns about their vulnerability to adversarial attacks on deep neural networks. However, current attack methods often lack practicality due to challenges in transitioning from the digital to the physical world, while existing physical attacks for object detection fail to achieve both multi-view effectiveness and… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.10071v3-abstract-full').style.display = 'inline'; document.getElementById('2409.10071v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.10071v3-abstract-full" style="display: none;"> The deployment of embodied navigation agents in safety-critical environments raises concerns about their vulnerability to adversarial attacks on deep neural networks. However, current attack methods often lack practicality due to challenges in transitioning from the digital to the physical world, while existing physical attacks for object detection fail to achieve both multi-view effectiveness and naturalness. To address this, we propose a practical attack method for embodied navigation by attaching adversarial patches with learnable textures and opacity to objects. Specifically, to ensure effectiveness across varying viewpoints, we employ a multi-view optimization strategy based on object-aware sampling, which uses feedback from the navigation model to optimize the patch's texture. To make the patch inconspicuous to human observers, we introduce a two-stage opacity optimization mechanism, where opacity is refined after texture optimization. 
Experimental results show our adversarial patches reduce navigation success rates by about 40%, outperforming previous methods in practicality, effectiveness, and naturalness. Code is available at: [https://github.com/chen37058/Physical-Attacks-in-Embodied-Navigation]. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.10071v3-abstract-full').style.display = 'none'; document.getElementById('2409.10071v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 16 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">8 pages, 6 figures, submitted to the 2025 IEEE International Conference on Robotics & Automation (ICRA)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.09253">arXiv:2409.09253</a> <span> [<a href="https://arxiv.org/pdf/2409.09253">pdf</a>, <a href="https://arxiv.org/format/2409.09253">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Unleash LLMs Potential for Recommendation by Coordinating Twin-Tower Dynamic Semantic Token Generator </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yin%2C+J">Jun Yin</a>, <a href="/search/cs?searchtype=author&query=Zeng%2C+Z">Zhengxin Zeng</a>, <a href="/search/cs?searchtype=author&query=Li%2C+M">Mingzheng Li</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+H">Hao Yan</a>, <a href="/search/cs?searchtype=author&query=Li%2C+C">Chaozhuo Li</a>, <a href="/search/cs?searchtype=author&query=Han%2C+W">Weihao Han</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jianjin Zhang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+R">Ruochen Liu</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+A">Allen Sun</a>, <a href="/search/cs?searchtype=author&query=Deng%2C+D">Denvy Deng</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+F">Feng Sun</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Q">Qi Zhang</a>, <a href="/search/cs?searchtype=author&query=Pan%2C+S">Shirui Pan</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+S">Senzhang Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.09253v1-abstract-short" style="display: inline;"> Owing to the unprecedented capability in semantic understanding and logical reasoning, the pre-trained large language models (LLMs) have shown fantastic potential in developing the next-generation recommender systems (RSs). 
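The described optimization (a learnable texture refined over many sampled viewpoints, followed by a second-stage opacity refinement) follows the general expectation-over-transformation recipe. Below is a toy, self-contained sketch of that recipe with a dummy convolutional net standing in for the navigation policy; none of the names, sizes, or loss weights come from the paper's code:

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

model = nn.Sequential(nn.Conv2d(3, 8, 5), nn.ReLU(), nn.AdaptiveAvgPool2d(1),
                      nn.Flatten(), nn.Linear(8, 4))   # dummy frozen policy
for p in model.parameters():
    p.requires_grad_(False)

tex = torch.zeros(1, 3, 16, 16, requires_grad=True)    # patch texture logits
alpha = torch.tensor(2.0, requires_grad=True)          # opacity logit

def apply_patch(imgs, tex, alpha):
    """Alpha-composite the patch at a random scale/position (one 'viewpoint')."""
    s = int(torch.randint(8, 24, (1,)))
    patch = F.interpolate(torch.sigmoid(tex), size=(s, s),
                          mode='bilinear', align_corners=False)
    y, x = (int(torch.randint(0, 64 - s, (1,))) for _ in range(2))
    pad = (x, 64 - s - x, y, 64 - s - y)               # left, right, top, bottom
    canvas = F.pad(patch, pad)                         # patch placed on full canvas
    mask = F.pad(torch.ones(1, 1, s, s), pad)
    a = torch.sigmoid(alpha)
    return imgs * (1 - a * mask) + a * canvas * mask

imgs = torch.rand(4, 3, 64, 64)                        # stand-in observations
labels = torch.zeros(4, dtype=torch.long)              # 'correct' actions

opt = torch.optim.Adam([tex], lr=0.05)                 # stage 1: texture only
for _ in range(100):
    loss = -F.cross_entropy(model(apply_patch(imgs, tex, alpha)), labels)
    opt.zero_grad(); loss.backward(); opt.step()

opt2 = torch.optim.Adam([alpha], lr=0.05)              # stage 2: refine opacity
for _ in range(50):
    attack = -F.cross_entropy(model(apply_patch(imgs, tex, alpha)), labels)
    loss = attack + 0.1 * torch.sigmoid(alpha)         # penalize visibility
    opt2.zero_grad(); loss.backward(); opt2.step()
```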
arXiv:2409.09253 (https://arxiv.org/abs/2409.09253)
Subjects: cs.IR; cs.AI; cs.CL; cs.LG
Title: Unleash LLMs Potential for Recommendation by Coordinating Twin-Tower Dynamic Semantic Token Generator
Authors: Jun Yin, Zhengxin Zeng, Mingzheng Li, Hao Yan, Chaozhuo Li, Weihao Han, Jianjin Zhang, Ruochen Liu, Allen Sun, Denvy Deng, Feng Sun, Qi Zhang, Shirui Pan, Senzhang Wang
Abstract: Owing to their unprecedented capability in semantic understanding and logical reasoning, pre-trained large language models (LLMs) have shown great potential in developing the next generation of recommender systems (RSs). However, the static index paradigm adopted by current methods greatly restricts the utilization of LLM capacity for recommendation, leading not only to insufficient alignment between semantic and collaborative knowledge, but also to neglect of high-order user-item interaction patterns. In this paper, we propose the Twin-Tower Dynamic Semantic Recommender (TTDS), the first generative RS to adopt a dynamic semantic index paradigm, aiming to resolve both problems simultaneously. More specifically, we design a dynamic knowledge fusion framework that integrates a twin-tower semantic token generator into the LLM-based recommender, hierarchically allocating meaningful semantic indices for items and users, and accordingly predicting the semantic index of the target item. Furthermore, a dual-modality variational auto-encoder is proposed to facilitate multi-grained alignment between semantic and collaborative knowledge. Finally, a series of novel tuning tasks specially customized for capturing high-order user-item interaction patterns is proposed to take advantage of users' historical behavior. Extensive experiments across three public datasets demonstrate the superiority of the proposed methodology in developing LLM-based generative RSs. The proposed TTDS recommender achieves an average improvement of 19.41% in Hit-Rate and 20.84% in NDCG, compared with the leading baseline methods.
Submitted 13 September, 2024; originally announced September 2024.
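Hit-Rate and NDCG, the metrics quoted above, are standard top-k ranking measures. A minimal reference computation for the common single-held-out-item protocol (an illustration, not the paper's evaluation code):

```python
import math

def hit_rate_and_ndcg(ranked_lists, held_out, k=10):
    """ranked_lists[u]: the model's top items for user u (best first);
    held_out[u]: that user's single ground-truth test item."""
    hits, ndcg = 0.0, 0.0
    for topk, target in zip(ranked_lists, held_out):
        if target in topk[:k]:
            hits += 1.0
            rank = topk.index(target)          # 0-based position in the list
            ndcg += 1.0 / math.log2(rank + 2)  # DCG of a single relevant item
    n = len(held_out)
    return hits / n, ndcg / n                  # HR@k, NDCG@k

print(hit_rate_and_ndcg([[3, 7, 1], [9, 2, 5]], held_out=[7, 8], k=3))
# -> (0.5, ~0.315): one hit at position 1 contributes 1/log2(3)
```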
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.07372">arXiv:2409.07372</a> <span> [<a href="https://arxiv.org/pdf/2409.07372">pdf</a>, <a href="https://arxiv.org/format/2409.07372">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> </div> </div> <p class="title is-5 mathjax"> Awaking the Slides: A Tuning-free and Knowledge-regulated AI Tutoring System via Language Model Coordination </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang-Li%2C+D">Daniel Zhang-Li</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Z">Zheyuan Zhang</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+J">Jifan Yu</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+J+L+J">Joy Lim Jia Yin</a>, <a href="/search/cs?searchtype=author&query=Tu%2C+S">Shangqing Tu</a>, <a href="/search/cs?searchtype=author&query=Gong%2C+L">Linlu Gong</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+H">Haohua Wang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Z">Zhiyuan Liu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+H">Huiqin Liu</a>, <a href="/search/cs?searchtype=author&query=Hou%2C+L">Lei Hou</a>, <a href="/search/cs?searchtype=author&query=Li%2C+J">Juanzi Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.07372v1-abstract-short" style="display: inline;"> The vast pre-existing slides serve as rich and important materials to carry lecture knowledge. However, effectively leveraging lecture slides to serve students is difficult due to the multi-modal nature of slide content and the heterogeneous teaching actions. We study the problem of discovering effective designs that convert a slide into an interactive lecture. We develop Slide2Lecture, a tuning-f… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.07372v1-abstract-full').style.display = 'inline'; document.getElementById('2409.07372v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.07372v1-abstract-full" style="display: none;"> The vast pre-existing slides serve as rich and important materials to carry lecture knowledge. However, effectively leveraging lecture slides to serve students is difficult due to the multi-modal nature of slide content and the heterogeneous teaching actions. We study the problem of discovering effective designs that convert a slide into an interactive lecture. We develop Slide2Lecture, a tuning-free and knowledge-regulated intelligent tutoring system that can (1) effectively convert an input lecture slide into a structured teaching agenda consisting of a set of heterogeneous teaching actions; (2) create and manage an interactive lecture that generates responsive interactions catering to student learning demands while regulating the interactions to follow teaching actions. 
Slide2Lecture contains a complete pipeline for learners to obtain an interactive classroom experience to learn the slide. For teachers and developers, Slide2Lecture enables customization to cater to personalized demands. The evaluation rated by annotators and students shows that Slide2Lecture is effective in outperforming the remaining implementation. Slide2Lecture's online deployment has made more than 200K interaction with students in the 3K lecture sessions. We open source Slide2Lecture's implementation in https://anonymous.4open.science/r/slide2lecture-4210/. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.07372v1-abstract-full').style.display = 'none'; document.getElementById('2409.07372v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.06750">arXiv:2409.06750</a> <span> [<a href="https://arxiv.org/pdf/2409.06750">pdf</a>, <a href="https://arxiv.org/format/2409.06750">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Multiagent Systems">cs.MA</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Can Agents Spontaneously Form a Society? Introducing a Novel Architecture for Generative Multi-Agents to Elicit Social Emergence </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+H">H. Zhang</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+J">J. Yin</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+M">M. Jiang</a>, <a href="/search/cs?searchtype=author&query=Su%2C+C">C. Su</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.06750v2-abstract-short" style="display: inline;"> Generative agents have demonstrated impressive capabilities in specific tasks, but most of these frameworks focus on independent tasks and lack attention to social interactions. We introduce a generative agent architecture called ITCMA-S, which includes a basic framework for individual agents and a framework called LTRHA that supports social interactions among multi-agents. This architecture enabl… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.06750v2-abstract-full').style.display = 'inline'; document.getElementById('2409.06750v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.06750v2-abstract-full" style="display: none;"> Generative agents have demonstrated impressive capabilities in specific tasks, but most of these frameworks focus on independent tasks and lack attention to social interactions. 
arXiv:2409.06750 (https://arxiv.org/abs/2409.06750)
Subjects: cs.MA; cs.AI; cs.HC; cs.LG
Title: Can Agents Spontaneously Form a Society? Introducing a Novel Architecture for Generative Multi-Agents to Elicit Social Emergence
Authors: H. Zhang, J. Yin, M. Jiang, C. Su
Abstract: Generative agents have demonstrated impressive capabilities in specific tasks, but most of these frameworks focus on independent tasks and pay little attention to social interactions. We introduce a generative agent architecture called ITCMA-S, which includes a basic framework for individual agents and a framework called LTRHA that supports social interactions among multiple agents. This architecture enables agents to identify and filter out behaviors that are detrimental to social interactions, guiding them to choose more favorable actions. We designed a sandbox environment to simulate the natural evolution of social relationships among multiple identity-less agents for experimental evaluation. The results show that ITCMA-S performs well on multiple evaluation indicators, demonstrating its ability to actively explore the environment, recognize new agents, and acquire new information through continuous actions and dialogue. Observations show that as agents establish connections with each other, they spontaneously form cliques with internal hierarchies around a selected leader and organize collective activities.
Submitted 19 November, 2024; v1 submitted 10 September, 2024; originally announced September 2024.
Comments: 13 pages, 8 figures
MSC Class: 68T42; ACM Class: I.2.7; J.4
arXiv:2409.06196 (https://arxiv.org/abs/2409.06196)
Subjects: cs.SD; cs.LG; eess.AS
Title: MTDA-HSED: Mutual-Assistance Tuning and Dual-Branch Aggregating for Heterogeneous Sound Event Detection
Authors: Zehao Wang, Haobo Yue, Zhicheng Zhang, Da Mu, Jin Tang, Jianqin Yin
Abstract: Sound Event Detection (SED) plays a vital role in comprehending and perceiving acoustic scenes. Previous methods have demonstrated impressive capabilities. However, they are deficient in learning features of complex scenes from heterogeneous datasets. In this paper, we introduce a novel dual-branch architecture named Mutual-Assistance Tuning and Dual-Branch Aggregating for Heterogeneous Sound Event Detection (MTDA-HSED). The MTDA-HSED architecture employs the Mutual-Assistance Audio Adapter (M3A) to effectively tackle the multi-scenario problem and uses the Dual-Branch Mid-Fusion (DBMF) module to tackle the multi-granularity problem. Specifically, M3A is integrated into the BEATs block as an adapter to improve BEATs' performance by fine-tuning it on the multi-scenario dataset. The DBMF module connects the BEATs and CNN branches, facilitating the deep fusion of information from both. Experimental results show that the proposed methods exceed the baseline mpAUC by 5% on the DESED and MAESTRO Real datasets. Code is available at https://github.com/Visitor-W/MTDA.
Submitted 11 September, 2024; v1 submitted 9 September, 2024; originally announced September 2024.
Comments: Submitted to ICASSP 2025
arXiv:2409.05794 (https://arxiv.org/abs/2409.05794)
Subjects: cs.LO
Title: Parf: Adaptive Parameter Refining for Abstract Interpretation
Authors: Zhongyi Wang, Linyu Yang, Mingshuai Chen, Yixuan Bu, Zhiyang Li, Qiuye Wang, Shengchao Qin, Xiao Yi, Jianwei Yin
Abstract: The core challenge in applying abstract interpretation lies in the configuration of abstraction and analysis strategies encoded by a large number of external parameters of static analysis tools. To attain low false-positive rates (i.e., accuracy) while preserving analysis efficiency, tuning the parameters heavily relies on expert knowledge and is thus difficult to automate. In this paper, we present a fully automated framework called Parf to adaptively tune the external parameters of abstract interpretation-based static analyzers. Parf models various types of parameters as random variables subject to probability distributions over latticed parameter spaces. It incrementally refines the probability distributions based on accumulated intermediate results generated by repeated sampling and analyzing, thereby ultimately yielding a set of highly accurate parameter settings within a given time budget. We have implemented Parf on top of Frama-C/Eva (an off-the-shelf open-source static analyzer for C programs) and compared it against the expert refinement strategy and Frama-C/Eva's official configurations on the Frama-C OSCS benchmark. Experimental results indicate that Parf achieves the lowest number of false positives on 34/37 (91.9%) program repositories, with exclusively best results on 12/37 (32.4%) of them. In particular, Parf exhibits promising performance for analyzing complex, large-scale real-world programs.
Submitted 9 September, 2024; originally announced September 2024.
ACM Class: D.2.4
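Parf's loop (model parameters as distributions, sample, analyze, refine toward the best-scoring settings) resembles a cross-entropy-style search over a discrete parameter space. A minimal sketch of that loop, with made-up parameter domains and a toy scoring function standing in for an actual Frama-C/Eva run:

```python
import random
from collections import Counter

SPACE = {                          # hypothetical latticed parameter domains
    "slevel": [0, 10, 100, 1000],
    "partition": ["none", "basic", "full"],
}

def analyze(cfg):
    """Stand-in for one analyzer run; returns a toy alarm count (lower is better)."""
    rnd = random.Random(str(sorted(cfg.items())))
    return rnd.random() + 0.1 * (3 - SPACE["slevel"].index(cfg["slevel"]))

dist = {k: {v: 1 / len(vs) for v in vs} for k, vs in SPACE.items()}
for _ in range(20):                # sample / analyze / refine within a budget
    samples = [{k: random.choices(list(d), weights=list(d.values()))[0]
                for k, d in dist.items()} for _ in range(16)]
    elites = sorted(samples, key=analyze)[:4]          # fewest alarms
    for k in dist:                                     # shift mass toward elites
        counts = Counter(e[k] for e in elites)
        dist[k] = {v: 0.7 * p + 0.3 * counts[v] / len(elites)
                   for v, p in dist[k].items()}

print({k: max(d, key=d.get) for k, d in dist.items()})  # most probable setting
```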
arXiv:2409.03332 (https://arxiv.org/abs/2409.03332)
Subjects: cs.RO
Title: Masked Sensory-Temporal Attention for Sensor Generalization in Quadruped Locomotion
Authors: Dikai Liu, Tianwei Zhang, Jianxiong Yin, Simon See
Abstract: With the rising focus on quadrupeds, a generalized policy capable of handling different robot models and sensory inputs would be highly beneficial. Although several methods have been proposed to address different morphologies, it remains a challenge for learning-based policies to manage various combinations of proprioceptive information. This paper presents Masked Sensory-Temporal Attention (MSTA), a novel transformer-based model with masking for quadruped locomotion. It employs direct sensor-level attention to enhance sensory-temporal understanding and handle different combinations of sensor data, serving as a foundation for incorporating unseen information. The model can effectively understand its state even with a large portion of missing information, and is flexible enough to be deployed on a physical system despite the long input sequence.
Submitted 5 September, 2024; originally announced September 2024.
Comments: Project website for video: https://johnliudk.github.io/msta/
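Sensor-level tokens with masking, as described, can be prototyped with a standard transformer encoder. The token layout and sizes below are guesses for illustration, not the paper's architecture:

```python
import torch
import torch.nn as nn

S, T, D = 12, 8, 64                      # sensors, timesteps, embed dim (assumed)
embed = nn.Linear(1, D)                  # one token per scalar sensor reading
enc = nn.TransformerEncoder(
    nn.TransformerEncoderLayer(d_model=D, nhead=4, batch_first=True),
    num_layers=2)

readings = torch.randn(2, T * S, 1)      # flattened sensory-temporal sequence
missing = torch.rand(2, T * S) < 0.4     # drop 40% of sensor tokens at random

tokens = embed(readings)
out = enc(tokens, src_key_padding_mask=missing)  # masked tokens are not attended to
state = out[~missing].mean(0)            # pool only the observed tokens
print(state.shape)                       # torch.Size([64])
```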
arXiv:2409.00754 (https://arxiv.org/abs/2409.00754)
Subjects: cs.AI
Title: Cooperative Path Planning with Asynchronous Multiagent Reinforcement Learning
Authors: Jiaming Yin, Weixiong Rao, Yu Xiao, Keshuang Tang
Abstract: In this paper, we study the shortest path problem (SPP) with multiple source-destination pairs (MSD), namely MSD-SPP, to minimize the average travel time of all shortest paths. The inherent traffic capacity limits within a road network contribute to competition among vehicles. Multi-agent reinforcement learning (MARL) models cannot offer effective and efficient path planning cooperation due to the asynchronous decision-making setting in MSD-SPP, where vehicles (a.k.a. agents) cannot simultaneously complete routing actions in the previous time step. To tackle the efficiency issue, we propose to divide an entire road network into multiple sub-graphs and subsequently execute a two-stage process of inter-region and intra-region route planning. To address the asynchrony issue, in the proposed asyn-MARL framework, we first design a global state that exploits a low-dimensional vector to implicitly represent the joint observations and actions of multi-agents. We then develop a novel trajectory collection mechanism to decrease the redundancy in training trajectories. Additionally, we design a novel actor network to facilitate cooperation among vehicles heading to the same or close destinations, and a reachability graph that prevents infinite loops in routing paths. On both synthetic and real road networks, our evaluation demonstrates that our approach outperforms state-of-the-art planning approaches.
Submitted 1 September, 2024; originally announced September 2024.
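The two-stage decomposition (route over a region graph first, then plan within regions) can be illustrated on a toy grid with networkx. The quadrant partition and nearest-gateway choice here are simplifications for illustration, not the paper's learned planner:

```python
import networkx as nx

G = nx.grid_2d_graph(6, 6)                         # toy road network
region = lambda n: (n[0] // 3, n[1] // 3)          # four 3x3 regions

# Stage 1: shortest route over the region graph.
R = nx.Graph([(region(u), region(v)) for u, v in G.edges
              if region(u) != region(v)])
src, dst = (0, 0), (5, 5)
region_seq = nx.shortest_path(R, region(src), region(dst))

# Stage 2: plan within consecutive regions via a gateway node.
path, cur = [src], src
for nxt in region_seq[1:]:
    allowed = [n for n in G if region(n) in (region(cur), nxt)]
    sub = G.subgraph(allowed)                      # restrict search to two regions
    gateway = min((n for n in allowed if region(n) == nxt),
                  key=lambda n: nx.shortest_path_length(sub, cur, n))
    path += nx.shortest_path(sub, cur, gateway)[1:]
    cur = gateway
path += nx.shortest_path(
    G.subgraph([n for n in G if region(n) == region(dst)]), cur, dst)[1:]
print(path)
```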
arXiv:2409.00565 (https://arxiv.org/abs/2409.00565)
Subjects: cs.LG; cs.CV; eess.SP
Title: Two-Stage Hierarchical and Explainable Feature Selection Framework for Dimensionality Reduction in Sleep Staging
Authors: Yangfan Deng, Hamad Albidah, Ahmed Dallal, Jijun Yin, Zhi-Hong Mao
Abstract: Sleep is crucial for human health, and EEG signals play a significant role in sleep research. Due to the high-dimensional nature of EEG signal data sequences, data visualization and clustering of different sleep stages have been challenging. To address these issues, we propose a two-stage hierarchical and explainable feature selection framework that incorporates a feature selection algorithm to improve the performance of dimensionality reduction. Inspired by topological data analysis, which can analyze the structure of high-dimensional data, we extract topological features from the EEG signals to compensate for the structural information loss that occurs in traditional spectro-temporal data analysis. Supported by the topological visualization of the data from different sleep stages and the classification results, the proposed features are shown to be effective supplements to traditional features. Finally, we compare the performance of three dimensionality reduction algorithms: Principal Component Analysis (PCA), t-Distributed Stochastic Neighbor Embedding (t-SNE), and Uniform Manifold Approximation and Projection (UMAP). Among them, t-SNE achieved the highest accuracy of 79.8%, but considering the overall performance in terms of computational resources and metrics, UMAP is the optimal choice.
Submitted 31 August, 2024; originally announced September 2024.
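The PCA/t-SNE/UMAP comparison is easy to reproduce in outline with scikit-learn plus the umap-learn package (assumed installed); the synthetic features below merely stand in for the paper's EEG feature vectors, and a 2-D kNN score serves as a rough proxy for the staging accuracy being compared:

```python
from sklearn.datasets import make_classification
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
import umap                                  # pip install umap-learn

X, y = make_classification(n_samples=600, n_features=50, n_informative=10,
                           n_classes=5, random_state=0)  # stand-in EEG features

reducers = {
    "PCA":   PCA(n_components=2, random_state=0),
    "t-SNE": TSNE(n_components=2, random_state=0),
    "UMAP":  umap.UMAP(n_components=2, random_state=0),
}
for name, reducer in reducers.items():
    Z = reducer.fit_transform(X)             # embed into 2-D
    acc = cross_val_score(KNeighborsClassifier(), Z, y, cv=5).mean()
    print(f"{name}: 2-D kNN accuracy = {acc:.3f}")
```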
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.15914">arXiv:2408.15914</a> <span> [<a href="https://arxiv.org/pdf/2408.15914">pdf</a>, <a href="https://arxiv.org/format/2408.15914">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> CoRe: Context-Regularized Text Embedding Learning for Text-to-Image Personalization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wu%2C+F">Feize Wu</a>, <a href="/search/cs?searchtype=author&query=Pang%2C+Y">Yun Pang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Junyi Zhang</a>, <a href="/search/cs?searchtype=author&query=Pang%2C+L">Lianyu Pang</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+J">Jian Yin</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+B">Baoquan Zhao</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Q">Qing Li</a>, <a href="/search/cs?searchtype=author&query=Mao%2C+X">Xudong Mao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.15914v1-abstract-short" style="display: inline;"> Recent advances in text-to-image personalization have enabled high-quality and controllable image synthesis for user-provided concepts. However, existing methods still struggle to balance identity preservation with text alignment. Our approach is based on the fact that generating prompt-aligned images requires a precise semantic understanding of the prompt, which involves accurately processing the… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.15914v1-abstract-full').style.display = 'inline'; document.getElementById('2408.15914v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.15914v1-abstract-full" style="display: none;"> Recent advances in text-to-image personalization have enabled high-quality and controllable image synthesis for user-provided concepts. However, existing methods still struggle to balance identity preservation with text alignment. Our approach is based on the fact that generating prompt-aligned images requires a precise semantic understanding of the prompt, which involves accurately processing the interactions between the new concept and its surrounding context tokens within the CLIP text encoder. To address this, we aim to embed the new concept properly into the input embedding space of the text encoder, allowing for seamless integration with existing tokens. We introduce Context Regularization (CoRe), which enhances the learning of the new concept's text embedding by regularizing its context tokens in the prompt. This is based on the insight that appropriate output vectors of the text encoder for the context tokens can only be achieved if the new concept's text embedding is correctly learned. CoRe can be applied to arbitrary prompts without requiring the generation of corresponding images, thus improving the generalization of the learned text embedding. Additionally, CoRe can serve as a test-time optimization technique to further enhance the generations for specific prompts. 
Comprehensive experiments demonstrate that our method outperforms several baseline methods in both identity preservation and text alignment. Code will be made publicly available. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.15914v1-abstract-full').style.display = 'none'; document.getElementById('2408.15914v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> </li> </ol> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Yin%2C+J&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Yin%2C+J&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Yin%2C+J&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Yin%2C+J&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Yin%2C+J&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Yin%2C+J&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">…</span></li> </ul> </nav> <div class="is-hidden-tablet"> <!-- feedback for mobile only --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a> </span> </div> </div> </main> <footer> <div class="columns is-desktop" role="navigation" aria-label="Secondary"> <!-- MetaColumn 1 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> 