
Search | arXiv e-print repository

<a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1&ndash;50 of 374 results for author: <span class="mathjax">Yan, H</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>&nbsp;&nbsp;</span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&amp;query=Yan%2C+H">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." 
type="text" value="Yan, H"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Yan%2C+H&amp;terms-0-field=author&amp;size=50&amp;order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Yan, H"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. 
</div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&amp;query=Yan%2C+H&amp;start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&amp;query=Yan%2C+H&amp;start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Yan%2C+H&amp;start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Yan%2C+H&amp;start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Yan%2C+H&amp;start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Yan%2C+H&amp;start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">&hellip;</span></li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.12448">arXiv:2411.12448</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.12448">pdf</a>, <a href="https://arxiv.org/format/2411.12448">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> Large Language Models for Lossless Image Compression: Next-Pixel Prediction in Language Space is All You Need </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Chen%2C+K">Kecheng Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+P">Pingping Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+H">Hui Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+J">Jie Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Y">Yibing Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+J">Jixin Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+S">Shiqi Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Yan%2C+H">Hong Yan</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+H">Haoliang Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.12448v1-abstract-short" style="display: inline;"> We have recently witnessed that ``Intelligence&#34; and `` Compression&#34; 
Abstract: We have recently witnessed that "intelligence" and "compression" are two sides of the same coin, where a large language model (LLM) with unprecedented intelligence is a general-purpose lossless compressor for various data modalities. This attribute particularly appeals to the lossless image compression community, given the increasing need to compress high-resolution images in the current streaming media era. Consequently, a spontaneous envision emerges: Can the compression performance of the LLM elevate lossless image compression to new heights? However, our findings indicate that the naive application of LLM-based lossless image compressors suffers from a considerable performance gap compared with existing state-of-the-art (SOTA) codecs on common benchmark datasets. In light of this, we are dedicated to fulfilling the unprecedented intelligence (compression) capacity of the LLM for lossless image compression tasks, thereby bridging the gap between theoretical and practical compression performance. Specifically, we propose P^2-LLM, a next-pixel prediction-based LLM, which integrates various elaborated insights and methodologies, e.g., pixel-level priors, the in-context ability of the LLM, and a pixel-level semantic preservation strategy, to enhance the understanding of pixel sequences for better next-pixel predictions. Extensive experiments on benchmark datasets demonstrate that P^2-LLM can beat SOTA classical and learned codecs.

Submitted 19 November, 2024; originally announced November 2024.
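The paper itself is not reproduced in this listing, but the "prediction = compression" premise the abstract leans on is easy to illustrate: an entropy coder driven by an autoregressive model spends about -log2 p(x_i | x_<i) bits per symbol, so better next-pixel prediction directly means smaller files. A minimal sketch of that accounting (the toy predictor and pixel sequence are invented for illustration, not from the paper):

```python
import math

def ideal_code_length_bits(pixels, predictor):
    """Sum of -log2 p(x_i | x_<i): the bit cost an ideal
    entropy coder pays under an autoregressive model."""
    total = 0.0
    for i, px in enumerate(pixels):
        probs = predictor(pixels[:i])     # distribution over values 0..255
        total += -math.log2(probs[px])
    return total

def uniform_predictor(context):
    # No learning: every pixel value equally likely -> 8 bits/pixel.
    return [1.0 / 256] * 256

def smooth_predictor(context, eps=0.05):
    # Toy "intelligent" model: assume the next pixel is near the last one.
    if not context:
        return [1.0 / 256] * 256
    last = context[-1]
    probs = [eps / 256] * 256
    for d in (-1, 0, 1):                  # put mass on last pixel +/- 1
        probs[(last + d) % 256] += (1.0 - eps) / 3
    return probs

pixels = [100, 101, 101, 102, 103, 103, 104, 104, 105, 106]
print(ideal_code_length_bits(pixels, uniform_predictor))  # 80.0 bits
print(ideal_code_length_bits(pixels, smooth_predictor))   # roughly 23 bits
```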
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.10990">arXiv:2411.10990</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.10990">pdf</a>, <a href="https://arxiv.org/format/2411.10990">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Hardware Architecture">cs.AR</span> </div> </div> <p class="title is-5 mathjax"> Timing-driven Approximate Logic Synthesis Based on Double-chase Grey Wolf Optimizer </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Hu%2C+X">Xiangfei Hu</a>, <a href="/search/cs?searchtype=author&amp;query=Ye%2C+Y">Yuyang Ye</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+T">Tinghuan Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Yan%2C+H">Hao Yan</a>, <a href="/search/cs?searchtype=author&amp;query=Yu%2C+B">Bei Yu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.10990v1-abstract-short" style="display: inline;"> With the shrinking technology nodes, timing optimization becomes increasingly challenging. Approximate logic synthesis (ALS) can perform local approximate changes (LACs) on circuits to optimize timing with the cost of slight inaccuracy. However, existing ALS methods that focus solely on critical path depth reduction (depth-driven methods) or area minimization (area-driven methods) are inefficient&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10990v1-abstract-full').style.display = 'inline'; document.getElementById('2411.10990v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.10990v1-abstract-full" style="display: none;"> With the shrinking technology nodes, timing optimization becomes increasingly challenging. Approximate logic synthesis (ALS) can perform local approximate changes (LACs) on circuits to optimize timing with the cost of slight inaccuracy. However, existing ALS methods that focus solely on critical path depth reduction (depth-driven methods) or area minimization (area-driven methods) are inefficient in achieving optimal timing improvement. %based on double-chase grey wolf optimizer (DCGWO). where we employ a double-chase grey wolf optimizer to explore and apply LACs, simultaneously bringing excellent critical path shortening and area reduction under error constraints. According to experiments on open-source circuits with TSMC 28nm technology, compared to the SOTA method, our framework can generate approximate circuits with greater critical path delay reduction under different error and area constraints. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10990v1-abstract-full').style.display = 'none'; document.getElementById('2411.10990v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.10498">arXiv:2411.10498</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.10498">pdf</a>, <a href="https://arxiv.org/format/2411.10498">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Prompt-Guided Environmentally Consistent Adversarial Patch </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Li%2C+C">Chaoqun Li</a>, <a href="/search/cs?searchtype=author&amp;query=Yan%2C+H">Huanqian Yan</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+L">Lifeng Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+T">Tairan Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Z">Zhuodong Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Su%2C+H">Hang Su</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.10498v1-abstract-short" style="display: inline;"> Adversarial attacks in the physical world pose a significant threat to the security of vision-based systems, such as facial recognition and autonomous driving. Existing adversarial patch methods primarily focus on improving attack performance, but they often produce patches that are easily detectable by humans and struggle to achieve environmental consistency, i.e., blending patches into the envir&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10498v1-abstract-full').style.display = 'inline'; document.getElementById('2411.10498v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.10498v1-abstract-full" style="display: none;"> Adversarial attacks in the physical world pose a significant threat to the security of vision-based systems, such as facial recognition and autonomous driving. Existing adversarial patch methods primarily focus on improving attack performance, but they often produce patches that are easily detectable by humans and struggle to achieve environmental consistency, i.e., blending patches into the environment. This paper introduces a novel approach for generating adversarial patches, which addresses both the visual naturalness and environmental consistency of the patches. We propose Prompt-Guided Environmentally Consistent Adversarial Patch (PG-ECAP), a method that aligns the patch with the environment to ensure seamless integration into the environment. The approach leverages diffusion models to generate patches that are both environmental consistency and effective in evading detection. To further enhance the naturalness and consistency, we introduce two alignment losses: Prompt Alignment Loss and Latent Space Alignment Loss, ensuring that the generated patch maintains its adversarial properties while fitting naturally within its environment. Extensive experiments in both digital and physical domains demonstrate that PG-ECAP outperforms existing methods in attack success rate and environmental consistency. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10498v1-abstract-full').style.display = 'none'; document.getElementById('2411.10498v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.07618">arXiv:2411.07618</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.07618">pdf</a>, <a href="https://arxiv.org/format/2411.07618">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Direct Preference Optimization Using Sparse Feature-Level Constraints </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Yin%2C+Q">Qingyu Yin</a>, <a href="/search/cs?searchtype=author&amp;query=Leong%2C+C+T">Chak Tou Leong</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+H">Hongbo Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+M">Minjun Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Yan%2C+H">Hanqi Yan</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Q">Qiang Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=He%2C+Y">Yulan He</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+W">Wenjie Li</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+J">Jun Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Y">Yue Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+L">Linyi Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.07618v1-abstract-short" style="display: inline;"> The alignment of large language models (LLMs) with human preferences remains a key challenge. While post-training techniques like Reinforcement Learning from Human Feedback (RLHF) and Direct Preference Optimization (DPO) have achieved notable success, they often introduce computational inefficiencies and training instability. In this paper, we propose Feature-level constrained Preference Optimizat&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.07618v1-abstract-full').style.display = 'inline'; document.getElementById('2411.07618v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.07618v1-abstract-full" style="display: none;"> The alignment of large language models (LLMs) with human preferences remains a key challenge. While post-training techniques like Reinforcement Learning from Human Feedback (RLHF) and Direct Preference Optimization (DPO) have achieved notable success, they often introduce computational inefficiencies and training instability. In this paper, we propose Feature-level constrained Preference Optimization (FPO), a novel method designed to simplify the alignment process while ensuring stability. 
FPO leverages pre-trained Sparse Autoencoders (SAEs) and introduces feature-level constraints, allowing for efficient, sparsity-enforced alignment. Our approach enjoys efficiency by using sparse features activated in a well-trained sparse autoencoder and the quality of sequential KL divergence by using the feature-level offline reference. Experimental results on benchmark datasets demonstrate that FPO achieves a 5.08% absolute improvement in win rate with much lower computational cost compared to state-of-the-art baselines, making it a promising solution for efficient and controllable LLM alignments. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.07618v1-abstract-full').style.display = 'none'; document.getElementById('2411.07618v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.07037">arXiv:2411.07037</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.07037">pdf</a>, <a href="https://arxiv.org/format/2411.07037">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> LIFBench: Evaluating the Instruction Following Performance and Stability of Large Language Models in Long-Context Scenarios </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wu%2C+X">Xiaodong Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+M">Minhao Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Y">Yichen Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Shi%2C+X">Xiaoming Shi</a>, <a href="/search/cs?searchtype=author&amp;query=Yan%2C+H">He Yan</a>, <a href="/search/cs?searchtype=author&amp;query=Lu%2C+X">Xiangju Lu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+J">Junmin Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+W">Wei Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.07037v1-abstract-short" style="display: inline;"> As Large Language Models (LLMs) continue to advance in natural language processing (NLP), their ability to stably follow instructions in long-context inputs has become crucial for real-world applications. While existing benchmarks assess various LLM capabilities, they rarely focus on instruction-following in long-context scenarios or stability on different inputs. In response, we introduce the Lon&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.07037v1-abstract-full').style.display = 'inline'; document.getElementById('2411.07037v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.07037v1-abstract-full" style="display: none;"> As Large Language Models (LLMs) continue to advance in natural language processing (NLP), their ability to stably follow instructions in long-context inputs has become crucial for real-world applications. 
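The exact FPO objective is not in this listing, but the general idea of a feature-level constraint can be made concrete: encode the policy's hidden states with a frozen, pre-trained SAE and penalize the active features for drifting from offline reference activations. A toy sketch under invented shapes (the encoder, top-k selection, and MSE penalty are all illustrative stand-ins, not the paper's formulation):

```python
import torch

# Hypothetical sizes: d_model-dim hidden states, d_sae sparse features.
d_model, d_sae = 64, 256
W_enc = torch.randn(d_model, d_sae) / d_model ** 0.5  # stand-in for a frozen SAE encoder

def sae_features(hidden):
    # ReLU encoder of a sparse autoencoder: most entries stay near zero.
    return torch.relu(hidden @ W_enc)

def feature_level_constraint(policy_hidden, ref_features, k=32):
    """Toy feature-level constraint: penalize the policy's top-k active
    SAE features for drifting from offline reference activations."""
    feats = sae_features(policy_hidden)           # (seq, d_sae)
    topk = feats.topk(k, dim=-1).indices          # only active features matter
    diff = feats.gather(-1, topk) - ref_features.gather(-1, topk)
    return diff.pow(2).mean()                     # MSE in feature space

policy_hidden = torch.randn(10, d_model, requires_grad=True)
ref_features = sae_features(torch.randn(10, d_model)).detach()  # offline, precomputed
loss = feature_level_constraint(policy_hidden, ref_features)
loss.backward()                                   # gradient flows to the policy only
print(loss.item())
```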
5. arXiv:2411.07037 [pdf, other] (cs.CL)

LIFBench: Evaluating the Instruction Following Performance and Stability of Large Language Models in Long-Context Scenarios

Authors: Xiaodong Wu, Minhao Wang, Yichen Liu, Xiaoming Shi, He Yan, Xiangju Lu, Junmin Zhu, Wei Zhang

Abstract: As Large Language Models (LLMs) continue to advance in natural language processing (NLP), their ability to stably follow instructions in long-context inputs has become crucial for real-world applications. While existing benchmarks assess various LLM capabilities, they rarely focus on instruction-following in long-context scenarios or stability across different inputs. In response, we introduce the Long-context Instruction-Following Benchmark (LIFBench), a scalable dataset designed to evaluate LLMs' instruction-following capabilities and stability across long contexts. LIFBench comprises three long-context scenarios and eleven diverse tasks, supported by 2,766 instructions generated through an automated expansion method across three dimensions: length, expression, and variables. For evaluation, we propose LIFEval, a rubric-based assessment framework that provides precise, automated scoring of complex LLM responses without relying on LLM-assisted evaluations or human judgment. This approach facilitates a comprehensive analysis of model performance and stability from various perspectives. We conduct extensive experiments on 20 notable LLMs across six length intervals, analyzing their instruction-following capabilities and stability. Our work contributes LIFBench and LIFEval as robust tools for assessing LLM performance in complex, long-context settings, providing insights that can inform future LLM development.

Submitted 11 November, 2024; originally announced November 2024.

Comments: 17 pages, 3 figures

6. arXiv:2411.06187 [pdf, other] (cs.CR)

BM-PAW: A Profitable Mining Attack in the PoW-based Blockchain System

Authors: Junjie Hu, Xunzhi Chen, Huan Yan, Na Ruan

Abstract: Mining attacks enable an adversary to procure a disproportionately large portion of mining rewards by deviating from honest mining practices within the PoW-based blockchain system. In this paper, we demonstrate that the security vulnerabilities of PoW-based blockchains extend beyond what these mining attacks initially reveal.
We introduce a novel mining strategy, named BM-PAW, which yields superior rewards for both the attacker and the targeted pool compared to the state-of-the-art mining attack, PAW. Our analysis reveals that BM-PAW attackers are incentivized to offer appropriate bribe money to other targets, as these targets comply with the attacker's directives upon receiving payment. We find that the BM-PAW attacker can circumvent the "miner's dilemma" through equilibrium analysis in a two-pool BM-PAW game scenario, wherein the outcome is determined by the attacker's mining power. We finally propose practical countermeasures to mitigate these novel pool attacks.

Submitted 9 November, 2024; originally announced November 2024.

Comments: 21 pages, 4 figures

7. arXiv:2411.05328 [pdf, other] (cs.SI)

Content Quality vs. Attention Allocation: An LLM-Based Case Study in Peer-to-peer Mental Health Networks

Authors: Teng Ye, Hanson Yan, Xuhuan Huang, Connor Grogan, Walter Yuan, Qiaozhu Mei, Matthew O. Jackson

Abstract: With the rise of social media and peer-to-peer networks, users increasingly rely on crowdsourced responses for information and assistance. However, the mechanisms used to rank and promote responses often prioritize, and end up biasing in favor of, timeliness over quality, which may result in suboptimal support for help-seekers. We analyze millions of responses to mental health-related posts, utilizing large language models (LLMs) to assess the multi-dimensional quality of content, including relevance, empathy, and cultural alignment, among other aspects. Our findings reveal a mismatch between content quality and attention allocation: earlier responses, despite being relatively lower in quality, receive disproportionately high fractions of upvotes and visibility due to platform ranking algorithms. We demonstrate that the quality of the top-ranked responses could be improved by up to 39 percent, and that even the simplest re-ranking strategy could significantly improve the quality of top responses, highlighting the need for more nuanced ranking mechanisms that prioritize both timeliness and content quality, especially emotional engagement, in online mental health communities.

Submitted 8 November, 2024; originally announced November 2024.

Comments: 9 pages, 6 figures

MSC Class: 91D30; 94A16
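The "simplest re-ranking strategy" the abstract alludes to is easy to picture: score each response on quality (the paper uses LLM judgments; a numeric score stands in here) and blend that with recency, rather than ranking by arrival time alone. A toy sketch with invented data and weights:

```python
# Toy re-ranking: blend LLM-judged quality with recency instead of
# ranking purely by arrival time. Scores and weights are invented.
responses = [
    # (id, minutes_after_post, llm_quality_score in [0, 1])
    ("r1", 2,   0.35),   # fast but shallow
    ("r2", 45,  0.90),   # slower, high-empathy answer
    ("r3", 10,  0.60),
    ("r4", 120, 0.80),
]

def rank_by_time(rs):
    return sorted(rs, key=lambda r: r[1])          # platform default

def rank_blended(rs, w_quality=0.7, max_delay=180):
    def score(r):
        recency = 1.0 - min(r[1], max_delay) / max_delay
        return w_quality * r[2] + (1 - w_quality) * recency
    return sorted(rs, key=score, reverse=True)

print([r[0] for r in rank_by_time(responses)])     # ['r1', 'r3', 'r2', 'r4']
print([r[0] for r in rank_blended(responses)])     # ['r2', 'r3', 'r4', 'r1']
```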
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">9 pages, 6 figures</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">MSC Class:</span> 91D30; 94A16 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.03236">arXiv:2411.03236</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.03236">pdf</a>, <a href="https://arxiv.org/format/2411.03236">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Enhancing Transformer Training Efficiency with Dynamic Dropout </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Yan%2C+H">Hanrui Yan</a>, <a href="/search/cs?searchtype=author&amp;query=Shao%2C+D">Dan Shao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.03236v1-abstract-short" style="display: inline;"> We introduce Dynamic Dropout, a novel regularization technique designed to enhance the training efficiency of Transformer models by dynamically adjusting the dropout rate based on training epochs or validation loss improvements. This approach addresses the challenge of balancing regularization and model capacity, which is crucial for achieving fast convergence and high performance. Our method invo&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.03236v1-abstract-full').style.display = 'inline'; document.getElementById('2411.03236v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.03236v1-abstract-full" style="display: none;"> We introduce Dynamic Dropout, a novel regularization technique designed to enhance the training efficiency of Transformer models by dynamically adjusting the dropout rate based on training epochs or validation loss improvements. This approach addresses the challenge of balancing regularization and model capacity, which is crucial for achieving fast convergence and high performance. Our method involves modifying the GPT model to accept a variable dropout rate and updating dropout layers during training using schedules such as linear decay, exponential decay, and validation loss-based adjustments. Extensive experiments on the Shakespeare\_char dataset demonstrate that Dynamic Dropout significantly accelerates training and improves inference efficiency compared to a baseline model with a fixed dropout rate. The validation loss-based adjustment schedule provided the best overall performance, highlighting the potential of Dynamic Dropout as a valuable technique for training large-scale Transformer models. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.03236v1-abstract-full').style.display = 'none'; document.getElementById('2411.03236v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.18695">arXiv:2410.18695</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.18695">pdf</a>, <a href="https://arxiv.org/format/2410.18695">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> PESFormer: Boosting Macro- and Micro-expression Spotting with Direct Timestamp Encoding </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Yu%2C+W">Wang-Wang Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+K">Kai-Fu Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Hu%2C+X">Xiangrui Hu</a>, <a href="/search/cs?searchtype=author&amp;query=Jiang%2C+J">Jingwen Jiang</a>, <a href="/search/cs?searchtype=author&amp;query=Yan%2C+H">Hong-Mei Yan</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Y">Yong-Jie Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.18695v1-abstract-short" style="display: inline;"> The task of macro- and micro-expression spotting aims to precisely localize and categorize temporal expression instances within untrimmed videos. Given the sparse distribution and varying durations of expressions, existing anchor-based methods often represent instances by encoding their deviations from predefined anchors. Additionally, these methods typically slice the untrimmed videos into fixed-&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.18695v1-abstract-full').style.display = 'inline'; document.getElementById('2410.18695v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.18695v1-abstract-full" style="display: none;"> The task of macro- and micro-expression spotting aims to precisely localize and categorize temporal expression instances within untrimmed videos. Given the sparse distribution and varying durations of expressions, existing anchor-based methods often represent instances by encoding their deviations from predefined anchors. Additionally, these methods typically slice the untrimmed videos into fixed-length sliding windows. However, anchor-based encoding often fails to capture all training intervals, and slicing the original video as sliding windows can result in valuable training intervals being discarded. To overcome these limitations, we introduce PESFormer, a simple yet effective model based on the vision transformer architecture to achieve point-to-interval expression spotting. PESFormer employs a direct timestamp encoding (DTE) approach to replace anchors, enabling binary classification of each timestamp instead of optimizing entire ground truths. Thus, all training intervals are retained in the form of discrete timestamps. To maximize the utilization of training intervals, we enhance the preprocessing process by replacing the short videos produced through the sliding window method.Instead, we implement a strategy that involves zero-padding the untrimmed training videos to create uniform, longer videos of a predetermined duration. 
This operation efficiently preserves the original training intervals and eliminates video slice enhancement.Extensive qualitative and quantitative evaluations on three datasets -- CAS(ME)^2, CAS(ME)^3 and SAMM-LV -- demonstrate that our PESFormer outperforms existing techniques, achieving the best performance. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.18695v1-abstract-full').style.display = 'none'; document.getElementById('2410.18695v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.18113">arXiv:2410.18113</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.18113">pdf</a>, <a href="https://arxiv.org/format/2410.18113">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Scalable Co-Clustering for Large-Scale Data through Dynamic Partitioning and Hierarchical Merging </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wu%2C+Z">Zihan Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+Z">Zhaoke Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Yan%2C+H">Hong Yan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.18113v1-abstract-short" style="display: inline;"> Co-clustering simultaneously clusters rows and columns, revealing more fine-grained groups. However, existing co-clustering methods suffer from poor scalability and cannot handle large-scale data. This paper presents a novel and scalable co-clustering method designed to uncover intricate patterns in high-dimensional, large-scale datasets. Specifically, we first propose a large matrix partitioning&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.18113v1-abstract-full').style.display = 'inline'; document.getElementById('2410.18113v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.18113v1-abstract-full" style="display: none;"> Co-clustering simultaneously clusters rows and columns, revealing more fine-grained groups. However, existing co-clustering methods suffer from poor scalability and cannot handle large-scale data. This paper presents a novel and scalable co-clustering method designed to uncover intricate patterns in high-dimensional, large-scale datasets. Specifically, we first propose a large matrix partitioning algorithm that partitions a large matrix into smaller submatrices, enabling parallel co-clustering. This method employs a probabilistic model to optimize the configuration of submatrices, balancing the computational efficiency and depth of analysis. 
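DTE as described reduces interval supervision to per-timestamp binary targets, which is simple to construct: mark every timestamp covered by a ground-truth expression interval as positive. A sketch of that label construction, with zero-padding to a fixed duration as in the preprocessing above (array shapes and names are illustrative, not the paper's code):

```python
import numpy as np

def dte_labels(intervals, num_frames, padded_len):
    """Per-timestamp binary targets for point-to-interval spotting:
    label[t] = 1 iff frame t falls inside a ground-truth interval.
    Frames beyond num_frames are zero padding."""
    labels = np.zeros(padded_len, dtype=np.int64)
    for start, end in intervals:            # inclusive frame indices
        labels[start:end + 1] = 1
    mask = np.zeros(padded_len, dtype=bool) # padding excluded from the loss
    mask[:num_frames] = True
    return labels, mask

# A 300-frame clip, zero-padded to 512 frames, with two expression intervals.
labels, mask = dte_labels([(40, 55), (210, 230)], num_frames=300, padded_len=512)
print(labels.sum())        # 37 positive timestamps (16 + 21)
print(mask.sum())          # 300 valid timestamps
```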
10. arXiv:2410.18113 [pdf, other] (cs.DC, cs.LG)

Scalable Co-Clustering for Large-Scale Data through Dynamic Partitioning and Hierarchical Merging

Authors: Zihan Wu, Zhaoke Huang, Hong Yan

Abstract: Co-clustering simultaneously clusters rows and columns, revealing more fine-grained groups. However, existing co-clustering methods suffer from poor scalability and cannot handle large-scale data. This paper presents a novel and scalable co-clustering method designed to uncover intricate patterns in high-dimensional, large-scale datasets. Specifically, we first propose a large matrix partitioning algorithm that partitions a large matrix into smaller submatrices, enabling parallel co-clustering. This method employs a probabilistic model to optimize the configuration of submatrices, balancing computational efficiency and depth of analysis. Additionally, we propose a hierarchical co-cluster merging algorithm that efficiently identifies and merges co-clusters from these submatrices, enhancing the robustness and reliability of the process. Extensive evaluations validate the effectiveness and efficiency of our method. Experimental results demonstrate a significant reduction in computation time, with an approximate 83% decrease for dense matrices and up to 30% for sparse matrices.

Submitted 9 October, 2024; originally announced October 2024.

Comments: 8 pages, 2 figures

MSC Class: H.2.8
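The partition-then-merge layout in this abstract is the classic recipe for parallelizing matrix algorithms. The paper's probabilistic configuration step is not shown here, but a fixed-grid version of the partitioning makes the data flow concrete (block sizes are arbitrary):

```python
import numpy as np

def partition(matrix, row_block, col_block):
    """Split a matrix into a grid of submatrices for parallel co-clustering.
    (The paper tunes the block configuration probabilistically; here the
    grid is fixed.) Returns {(i, j): submatrix} keyed by block offsets."""
    n_rows, n_cols = matrix.shape
    blocks = {}
    for i in range(0, n_rows, row_block):
        for j in range(0, n_cols, col_block):
            blocks[(i, j)] = matrix[i:i + row_block, j:j + col_block]
    return blocks

X = np.arange(12 * 8).reshape(12, 8)
blocks = partition(X, row_block=4, col_block=4)
print(len(blocks))                      # 6 submatrices (3 x 2 grid)
# Each block would be co-clustered independently (e.g., in parallel
# worker processes), then the co-clusters merged hierarchically.
```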
11. arXiv:2410.17144 [pdf, other] (cs.CV)

YOLO-TS: Real-Time Traffic Sign Detection with Enhanced Accuracy Using Optimized Receptive Fields and Anchor-Free Fusion

Authors: Junzhou Chen, Heqiang Huang, Ronghui Zhang, Nengchao Lyu, Yanyong Guo, Hong-Ning Dai, Hong Yan

Abstract: Ensuring safety in both autonomous driving and advanced driver-assistance systems (ADAS) depends critically on the efficient deployment of traffic sign recognition technology. While current methods show effectiveness, they often compromise between speed and accuracy. To address this issue, we present a novel real-time and efficient road sign detection network, YOLO-TS. This network significantly improves performance by optimizing the receptive fields of multi-scale feature maps to align more closely with the size distribution of traffic signs in various datasets. Moreover, our innovative feature-fusion strategy, leveraging the flexibility of anchor-free methods, allows for multi-scale object detection on a high-resolution feature map abundant in contextual information, achieving remarkable enhancements in both accuracy and speed. To mitigate the adverse effects of the grid pattern caused by dilated convolutions on the detection of smaller objects, we have devised a unique module that not only mitigates this grid effect but also widens the receptive field to encompass an extensive range of spatial contextual information, thus boosting the efficiency of information usage. Evaluation on the challenging public datasets TT100K and CCTSDB2021 demonstrates that YOLO-TS surpasses existing state-of-the-art methods in terms of both accuracy and speed. The code for our method will be available.

Submitted 22 October, 2024; originally announced October 2024.

Comments: 13 pages, 9 figures and 7 tables
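Receptive-field bookkeeping of the kind this abstract optimizes follows a standard recurrence: each layer grows the receptive field by (kernel - 1) x dilation x (cumulative stride). A small helper makes the effect of dilation concrete (the example conv stack is invented, not the YOLO-TS architecture):

```python
def receptive_field(layers):
    """layers: (kernel, stride, dilation) per conv, input to output.
    Returns the receptive field size at the final feature map."""
    rf, jump = 1, 1          # field of one output unit; step size in input pixels
    for k, s, d in layers:
        rf += (k - 1) * d * jump
        jump *= s
    return rf

# The same 3x3 stack, with and without dilation in the last two convs.
plain   = [(3, 2, 1), (3, 1, 1), (3, 1, 1)]
dilated = [(3, 2, 1), (3, 1, 2), (3, 1, 2)]
print(receptive_field(plain))    # 11
print(receptive_field(dilated))  # 19: dilation widens the field without extra stride
```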
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">13 pages, 9 figures and 7 tables</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.15645">arXiv:2410.15645</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.15645">pdf</a>, <a href="https://arxiv.org/format/2410.15645">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Boosting Jailbreak Transferability for Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Liu%2C+H">Hanqing Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+L">Lifeng Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Yan%2C+H">Huanqian Yan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.15645v2-abstract-short" style="display: inline;"> Large language models have drawn significant attention to the challenge of safe alignment, especially regarding jailbreak attacks that circumvent security measures to produce harmful content. To address the limitations of existing methods like GCG, which perform well in single-model attacks but lack transferability, we propose several enhancements, including a scenario induction template, optimize&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.15645v2-abstract-full').style.display = 'inline'; document.getElementById('2410.15645v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.15645v2-abstract-full" style="display: none;"> Large language models have drawn significant attention to the challenge of safe alignment, especially regarding jailbreak attacks that circumvent security measures to produce harmful content. To address the limitations of existing methods like GCG, which perform well in single-model attacks but lack transferability, we propose several enhancements, including a scenario induction template, optimized suffix selection, and the integration of re-suffix attack mechanism to reduce inconsistent outputs. Our approach has shown superior performance in extensive experiments across various benchmarks, achieving nearly 100% success rates in both attack execution and transferability. Notably, our method has won the first place in the AISG-hosted Global Challenge for Safe and Secure LLMs. The code is released at https://github.com/HqingLiu/SI-GCG. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.15645v2-abstract-full').style.display = 'none'; document.getElementById('2410.15645v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 21 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. 
arXiv:2410.12191 (https://arxiv.org/abs/2410.12191) [pdf, other] cs.CV, cs.MM
Title: Test-time adaptation for image compression with distribution regularization
Authors: Kecheng Chen, Pingping Zhang, Tiexin Qin, Shiqi Wang, Hong Yan, Haoliang Li
Abstract: Current test- or compression-time adaptation image compression (TTA-IC) approaches, which leverage both latent and decoder refinements as a two-step adaptation scheme, have potentially enhanced the rate-distortion (R-D) performance of learned image compression models on cross-domain compression tasks, e.g., from natural to screen content images. However, compared with the emergence of various decoder refinement variants, the latent refinement, as an inseparable ingredient, is barely tailored to cross-domain scenarios. To this end, we aim to develop an advanced latent refinement method by extending the effective hybrid latent refinement (HLR) method, which is designed for in-domain inference improvement but shows noticeable degradation of the rate cost in cross-domain tasks. Specifically, we first provide theoretical analyses, via a marginalization approximation from in- to cross-domain scenarios, to uncover that the vanilla HLR suffers from an underlying mismatch between the refined Gaussian conditional and hyperprior distributions, leading to a deteriorated joint probability approximation of the marginal distribution with increased rate consumption. To remedy this issue, we introduce a simple Bayesian approximation-endowed distribution regularization to encourage learning a better joint probability approximation in a plug-and-play manner. Extensive experiments on six in- and cross-domain datasets demonstrate that our proposed method not only improves the R-D performance compared with other latent refinement counterparts, but also can be flexibly integrated into existing TTA-IC methods with incremental benefits.
Submitted 15 October, 2024; originally announced October 2024.
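The distribution-regularization idea, pulling the refined Gaussian conditional back toward the (hyper)prior during test-time latent refinement, can be sketched in a few lines. Everything below (the linear decoder, the fixed scales, the 0.1 weight) is an illustrative toy, not the authors' codec:

```python
import torch

torch.manual_seed(0)
D = torch.randn(64, 16)                        # toy linear "decoder"
x = torch.randn(64)                            # target signal to adapt to
prior = torch.distributions.Normal(0.0, 1.0)   # stand-in hyperprior

y = torch.zeros(16, requires_grad=True)        # latent refined at test time
opt = torch.optim.Adam([y], lr=1e-2)
for _ in range(200):
    x_hat = D @ y
    cond = torch.distributions.Normal(y, 0.5)  # refined Gaussian conditional
    distortion = torch.mean((x - x_hat) ** 2)
    # distribution regularization: keep the conditional near the prior so the
    # joint probability (and hence the rate estimate) stays well approximated
    reg = torch.distributions.kl_divergence(cond, prior).mean()
    loss = distortion + 0.1 * reg
    opt.zero_grad(); loss.backward(); opt.step()
```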
arXiv:2410.09132 (https://arxiv.org/abs/2410.09132) [pdf, other] cs.LG, cs.AI, cs.CV
Title: When Graph meets Multimodal: Benchmarking on Multimodal Attributed Graphs Learning
Authors: Hao Yan, Chaozhuo Li, Zhigang Yu, Jun Yin, Ruochen Liu, Peiyan Zhang, Weihao Han, Mingzheng Li, Zhengxin Zeng, Hao Sun, Weiwei Deng, Feng Sun, Qi Zhang, Senzhang Wang
Abstract: Multimodal attributed graphs (MAGs) are prevalent in various real-world scenarios and generally contain two kinds of knowledge: (a) Attribute knowledge is mainly supported by the attributes of different modalities contained in nodes (entities) themselves, such as texts and images. (b) Topology knowledge, on the other hand, is provided by the complex interactions posed between nodes. The cornerstone of MAG representation learning lies in the seamless integration of multimodal attributes and topology. Recent advancements in Pre-trained Language/Vision models (PLMs/PVMs) and Graph Neural Networks (GNNs) have facilitated effective learning on MAGs, garnering increased research interest. However, the absence of meaningful benchmark datasets and standardized evaluation procedures for MAG representation learning has impeded progress in this field. In this paper, we propose the Multimodal Attribute Graph Benchmark (MAGB), a comprehensive and diverse collection of challenging benchmark datasets for MAGs. The MAGB datasets are notably large in scale and encompass a wide range of domains, spanning from e-commerce networks to social networks. In addition to the brand-new datasets, we conduct extensive benchmark experiments over MAGB with various learning paradigms, ranging from GNN-based to PLM-based methods, to explore the necessity and feasibility of integrating multimodal attributes and graph topology. In a nutshell, we provide an overview of the MAG datasets, standardized evaluation procedures, and present baseline experiments. The entire MAGB project is publicly accessible at https://github.com/sktsherlock/ATG.
Submitted 11 October, 2024; originally announced October 2024.
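A minimal sketch of the paradigm MAGB benchmarks: frozen PLM/PVM embeddings fused into node features, followed by one message-passing step. The dimensions and the mean-style aggregation are illustrative assumptions, not code from the MAGB repository:

```python
import torch
import torch.nn as nn

class FusedNodeEncoder(nn.Module):
    """Project concatenated text+image node embeddings, then do one
    message-passing step over a (normalized) adjacency matrix."""
    def __init__(self, d_text, d_image, d_hidden):
        super().__init__()
        self.proj = nn.Linear(d_text + d_image, d_hidden)

    def forward(self, text_emb, image_emb, adj):
        h = torch.relu(self.proj(torch.cat([text_emb, image_emb], dim=-1)))
        return adj @ h   # aggregate neighbor features

enc = FusedNodeEncoder(d_text=384, d_image=512, d_hidden=128)
nodes = enc(torch.randn(5, 384), torch.randn(5, 512), torch.eye(5))
```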
arXiv:2410.05970 (https://arxiv.org/abs/2410.05970) [pdf, other] cs.CV, cs.AI, cs.CL
Title: PDF-WuKong: A Large Multimodal Model for Efficient Long PDF Reading with End-to-End Sparse Sampling
Authors: Xudong Xie, Liang Yin, Hao Yan, Yang Liu, Jing Ding, Minghui Liao, Yuliang Liu, Wei Chen, Xiang Bai
Abstract: Document understanding is a challenging task that requires processing and comprehending large amounts of textual and visual information. Recent advances in Large Language Models (LLMs) have significantly improved the performance of this task. However, existing methods typically focus on either plain text or a limited number of document images, struggling to handle long PDF documents with interleaved text and images, especially in academic papers. In this paper, we introduce PDF-WuKong, a multimodal large language model (MLLM) designed to enhance multimodal question-answering (QA) for long PDF documents. PDF-WuKong incorporates a sparse sampler that operates on both text and image representations, significantly improving the efficiency and capability of the MLLM. The sparse sampler is integrated with the MLLM's image encoder and selects the paragraphs or diagrams most pertinent to user queries for processing by the language model. To effectively train and evaluate our model, we construct PaperPDF, a dataset consisting of a broad collection of academic papers sourced from arXiv; multiple strategies are proposed to automatically generate 1M QA pairs along with their corresponding evidence sources. Experimental results demonstrate the superiority and high efficiency of our approach over other models on the task of long multimodal PDF understanding, surpassing proprietary products by an average of 8.6% on F1. Our code and dataset will be released at https://github.com/yh-hust/PDF-Wukong.
Submitted 8 October, 2024; originally announced October 2024.
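The sparse-sampling step can be pictured as embedding-space retrieval: score every paragraph or diagram against the query and keep only the top-k for the language model. A generic sketch under that reading (the embedding sizes and k are placeholders, not the released PDF-WuKong code):

```python
import numpy as np

def sparse_sample(query_vec, chunk_vecs, k=8):
    """Indices of the k chunks (paragraph/diagram embeddings) most
    cosine-similar to the query embedding."""
    q = query_vec / np.linalg.norm(query_vec)
    c = chunk_vecs / np.linalg.norm(chunk_vecs, axis=1, keepdims=True)
    return np.argsort(-(c @ q))[:k]

keep = sparse_sample(np.random.rand(768), np.random.rand(200, 768))
```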
arXiv:2410.05731 (https://arxiv.org/abs/2410.05731) [pdf, other] cs.IR
Title: Enhancing SPARQL Generation by Triplet-order-sensitive Pre-training
Authors: Chang Su, Jiexing Qi, He Yan, Kai Zou, Zhouhan Lin
Abstract: Semantic parsing that translates natural language queries to SPARQL is of great importance for Knowledge Graph Question Answering (KGQA) systems. Although pre-trained language models like T5 have achieved significant success in the Text-to-SPARQL task, their generated outputs still exhibit notable errors specific to the SPARQL language, such as triplet flips. To address this challenge and further improve the performance, we propose an additional pre-training stage with a new objective, Triplet Order Correction (TOC), along with the commonly used Masked Language Modeling (MLM), to collectively enhance the model's sensitivity to triplet order and SPARQL syntax. Our method achieves state-of-the-art performance on three widely used benchmarks.
Submitted 8 October, 2024; originally announced October 2024.
Comments: accepted by CIKM 2024
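A plausible way to realize a TOC objective is to corrupt triplet order in the input and train the model to restore it, analogous to how MLM corrupts tokens. The helper below is a hypothetical sketch of that data preparation, not the authors' implementation:

```python
import random

def make_toc_example(triplets):
    """One TOC training pair: flip subject/object in one random triplet
    to form the corrupted input; the original list is the target."""
    corrupted = [list(t) for t in triplets]
    i = random.randrange(len(corrupted))
    s, p, o = corrupted[i]
    corrupted[i] = [o, p, s]          # the classic "triplet flip" error
    return corrupted, triplets

src, tgt = make_toc_example([("?film", "dbo:director", "dbr:Christopher_Nolan")])
```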
arXiv:2410.02847 (https://arxiv.org/abs/2410.02847) [pdf, other] q-bio.QM, cs.AI
Title: Deep Signature: Characterization of Large-Scale Molecular Dynamics
Authors: Tiexin Qin, Mengxu Zhu, Chunyang Li, Terry Lyons, Hong Yan, Haoliang Li
Abstract: Understanding protein dynamics is essential for deciphering protein functional mechanisms and developing molecular therapies. However, the complex high-dimensional dynamics and interatomic interactions of biological processes pose significant challenges for existing computational techniques. In this paper, we approach this problem for the first time by introducing Deep Signature, a novel computationally tractable framework that characterizes complex dynamics and interatomic interactions based on their evolving trajectories. Specifically, our approach incorporates soft spectral clustering that locally aggregates cooperative dynamics to reduce the size of the system, as well as a signature transform that collects iterated integrals to provide a global characterization of the non-smooth interactive dynamics. Theoretical analysis demonstrates that Deep Signature exhibits several desirable properties, including invariance to translation, near invariance to rotation, equivariance to permutation of atomic coordinates, and invariance under time reparameterization. Furthermore, experimental results on three benchmarks of biological processes verify that our approach achieves superior performance compared to baseline methods.
Submitted 3 October, 2024; originally announced October 2024.
Comments: 17 pages, 8 figures
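The signature transform itself is concrete: for a piecewise-linear path, the first two levels of iterated integrals have closed forms. The sketch below computes them exactly for an (n_steps, d) trajectory; Deep Signature additionally applies soft spectral clustering and deeper signature levels:

```python
import numpy as np

def signature_level2(path):
    """Exact level-1 and level-2 signature terms for a piecewise-linear
    path given as an (n_steps, d) array of coordinates."""
    dX = np.diff(path, axis=0)              # increments of the path
    S1 = dX.sum(axis=0)                     # level 1: total displacement
    cum = np.cumsum(dX, axis=0) - dX        # sum of increments before step k
    S2 = cum.T @ dX + 0.5 * (dX.T @ dX)     # iterated integral over s < t
    return S1, S2

S1, S2 = signature_level2(np.cumsum(np.random.randn(100, 3), axis=0))
```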
arXiv:2410.01404 (https://arxiv.org/abs/2410.01404) [pdf, other] cs.CV
Title: Gaussian-Det: Learning Closed-Surface Gaussians for 3D Object Detection
Authors: Hongru Yan, Yu Zheng, Yueqi Duan
Abstract: Skin wrapping around our bodies, leather covering the sofa, sheet metal coating the car: all of this suggests that objects are enclosed by a series of continuous surfaces, which provides us with an informative geometry prior for objectness deduction. In this paper, we propose Gaussian-Det, which leverages Gaussian Splatting as a surface representation for multi-view based 3D object detection.
Unlike existing monocular or NeRF-based methods which depict the objects via discrete positional data, Gaussian-Det models the objects in a continuous manner by formulating the input Gaussians as feature descriptors on a mass of partial surfaces. Furthermore, to address the numerous outliers inherently introduced by Gaussian splatting, we accordingly devise a Closure Inferring Module (CIM) for comprehensive surface-based objectness deduction. CIM first estimates the probabilistic feature residuals for partial surfaces given the underdetermined nature of Gaussian Splatting, which are then coalesced into a holistic representation on the overall surface closure of the object proposal. In this way, the surface information Gaussian-Det exploits serves as a prior on the quality and reliability of objectness and the information basis of proposal refinement. Experiments on both synthetic and real-world datasets demonstrate that Gaussian-Det outperforms various existing approaches, in terms of both average precision and recall.
Submitted 2 October, 2024; originally announced October 2024.
arXiv:2410.00166 (https://arxiv.org/abs/2410.00166) [pdf, other] cs.CV
Title: EEG Emotion Copilot: Pruning LLMs for Emotional EEG Interpretation with Assisted Medical Record Generation
Authors: Hongyu Chen, Weiming Zeng, Chengcheng Chen, Luhui Cai, Fei Wang, Lei Wang, Wei Zhang, Yueyang Li, Hongjie Yan, Wai Ting Siok, Nizhuan Wang
Abstract: In the fields of affective computing (AC) and brain-machine interface (BMI), the analysis of physiological and behavioral signals to discern individual emotional states has emerged as a critical research frontier. While deep learning-based approaches have made notable strides in EEG emotion recognition, particularly in feature extraction and pattern recognition, significant challenges persist in achieving end-to-end emotion computation, including real-time processing, individual adaptation, and seamless user interaction. This paper presents the EEG Emotion Copilot, a system leveraging a lightweight large language model (LLM) operating in a local setting. The system is designed to first recognize emotional states directly from EEG signals, subsequently generate personalized diagnostic and treatment suggestions, and finally support the automation of electronic medical records. The proposed solution emphasizes both the accuracy of emotion recognition and an enhanced user experience, facilitated by an intuitive interface for participant interaction. We further discuss the construction of the data framework, model pruning, training, and deployment strategies aimed at improving real-time performance and computational efficiency. Privacy concerns are also addressed, with a focus on ethical data collection, processing, and the protection of users' personal information. Through these efforts, we aim to advance the application of AC in the medical domain, offering innovative approaches to mental health diagnostics and treatment.
Submitted 30 September, 2024; originally announced October 2024.
Comments: 8 pages, 9 figures
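Pruning an LLM for local deployment, as discussed here, is often approximated by magnitude pruning of linear layers. A generic sketch of that baseline (the 50% sparsity and the choice of layers are assumptions, not the paper's recipe):

```python
import torch
import torch.nn as nn

def magnitude_prune(model, sparsity=0.5):
    """Zero the smallest-magnitude weights in every Linear layer."""
    for module in model.modules():
        if isinstance(module, nn.Linear):
            w = module.weight.data
            k = int(w.numel() * sparsity)
            if k:
                thresh = w.abs().flatten().kthvalue(k).values
                w.mul_((w.abs() > thresh).float())

magnitude_prune(nn.Sequential(nn.Linear(32, 32), nn.ReLU(), nn.Linear(32, 8)))
```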
arXiv:2410.00003 (https://arxiv.org/abs/2410.00003) [pdf, other] cs.CV
Title: Language-centered Human Activity Recognition
Authors: Hua Yan, Heng Tan, Yi Ding, Pengfei Zhou, Vinod Namboodiri, Yu Yang
Abstract: Human Activity Recognition (HAR) using Inertial Measurement Unit (IMU) sensors is critical for applications in healthcare, safety, and industrial production. However, variations in activity patterns, device types, and sensor placements create distribution gaps across datasets, reducing the performance of HAR models. To address this, we propose LanHAR, a novel system that leverages Large Language Models (LLMs) to generate semantic interpretations of sensor readings and activity labels for cross-dataset HAR. This approach not only mitigates cross-dataset heterogeneity but also enhances the recognition of new activities. LanHAR employs an iterative re-generation method to produce high-quality semantic interpretations with LLMs and a two-stage training framework that bridges the semantic interpretations of sensor readings and activity labels. This ultimately leads to a lightweight sensor encoder suitable for mobile deployment, enabling any sensor reading to be mapped into the semantic interpretation space. Experiments on four public datasets demonstrate that our approach significantly outperforms state-of-the-art methods in both cross-dataset HAR and new activity recognition. The source code will be made publicly available.
Submitted 2 October, 2024; v1 submitted 12 September, 2024; originally announced October 2024.
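The second training stage of a LanHAR-style pipeline can be read as cross-modal alignment: a small sensor encoder is trained so its embeddings land near the LLM-generated interpretation embeddings. The architecture and loss below are illustrative assumptions, not the paper's exact design:

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class SensorEncoder(nn.Module):
    """Map raw IMU windows (batch, channels, time) into the embedding
    space of LLM-generated semantic interpretations."""
    def __init__(self, in_ch=6, d=256):
        super().__init__()
        self.net = nn.Sequential(
            nn.Conv1d(in_ch, 64, kernel_size=5, padding=2), nn.ReLU(),
            nn.AdaptiveAvgPool1d(1), nn.Flatten(), nn.Linear(64, d))

    def forward(self, x):
        return self.net(x)

def alignment_loss(sensor_emb, text_emb):
    # pull each sensor embedding toward its interpretation embedding
    return 1 - F.cosine_similarity(sensor_emb, text_emb).mean()

loss = alignment_loss(SensorEncoder()(torch.randn(4, 6, 128)), torch.randn(4, 256))
```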
arXiv:2409.20043 (https://arxiv.org/abs/2409.20043) [pdf, other] cs.CV
Title: OPONeRF: One-Point-One NeRF for Robust Neural Rendering
Authors: Yu Zheng, Yueqi Duan, Kangfu Zheng, Hongru Yan, Jiwen Lu, Jie Zhou
Abstract: In this paper, we propose a One-Point-One NeRF (OPONeRF) framework for robust scene rendering. Existing NeRFs are designed based on a key assumption that the target scene remains unchanged between the training and test time.
However, small but unpredictable perturbations such as object movements, light changes and data contaminations broadly exist in real-life 3D scenes, which lead to significantly defective or failed rendering results even for the recent state-of-the-art generalizable methods. To address this, we propose a divide-and-conquer framework in OPONeRF that adaptively responds to local scene variations via personalizing appropriate point-wise parameters, instead of fitting a single set of NeRF parameters that are inactive to test-time unseen changes. Moreover, to explicitly capture the local uncertainty, we decompose the point representation into deterministic mapping and probabilistic inference. In this way, OPONeRF learns the sharable invariance and unsupervisedly models the unexpected scene variations between the training and testing scenes. To validate the effectiveness of the proposed method, we construct benchmarks from both realistic and synthetic data with diverse test-time perturbations including foreground motions, illumination variations and multi-modality noises, which are more challenging than conventional generalization and temporal reconstruction benchmarks. Experimental results show that our OPONeRF outperforms state-of-the-art NeRFs on various evaluation metrics through benchmark experiments and cross-scene evaluations. We further show the efficacy of the proposed method via experimenting on other existing generalization-based benchmarks and incorporating the idea of One-Point-One NeRF into other advanced baseline methods.
Submitted 10 October, 2024; v1 submitted 30 September, 2024; originally announced September 2024.
Comments: Project page and dataset: https://yzheng97.github.io/OPONeRF/
arXiv:2409.17497 (https://arxiv.org/abs/2409.17497) [pdf, other] cs.RO
Title: Precise Interception Flight Targets by Image-based Visual Servoing of Multicopter
Authors: Hailong Yan, Kun Yang, Yixiao Cheng, Zihao Wang, Dawei Li
Abstract: Interception of low-altitude intruding targets with low-cost drones equipped with strapdown cameras presents a competitive option. However, the malicious maneuvers of the non-cooperative target and the coupling of the camera make the task challenging. To solve this problem, an Image-Based Visual Servoing (IBVS) control algorithm based on proportional navigation guidance with field-of-view holding capability is designed. The proposed controller reduces the miss distance while improving the stability of the visual servo system during interception. Software-in-the-loop (SITL) simulation experiments show a 72.8% reduction in the circular error probability (CEP) compared to the most recent study. This improvement enhances interception accuracy from the decimeter to the centimeter level. Real-world experiments further validate the effectiveness of the proposed algorithm.
Submitted 25 September, 2024; originally announced September 2024.
Comments: 9 pages, 15 figures; in the process of being submitted to IEEE Transactions on Industrial Electronics
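The guidance component named in this abstract, proportional navigation, has a standard textbook form: commanded lateral acceleration proportional to closing speed times line-of-sight rate. A planar sketch of that baseline law (not the paper's IBVS controller itself; the gain N=4 and sample state are arbitrary):

```python
import numpy as np

def png_accel(r_rel, v_rel, N=4.0):
    """Commanded lateral acceleration a = N * Vc * LOS_rate for planar
    target-relative position r_rel and velocity v_rel."""
    los_rate = (r_rel[0] * v_rel[1] - r_rel[1] * v_rel[0]) / (r_rel @ r_rel)
    closing_speed = -(r_rel @ v_rel) / np.linalg.norm(r_rel)
    return N * closing_speed * los_rate

a_cmd = png_accel(np.array([100.0, 20.0]), np.array([-30.0, -2.0]))
```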
arXiv:2409.15373 (https://arxiv.org/abs/2409.15373) [pdf, ps, other] cs.LG, cs.AI, cs.IR; doi: 10.1145/3640457.3688040 (https://doi.org/10.1145/3640457.3688040)
Title: Enhancing Performance and Scalability of Large-Scale Recommendation Systems with Jagged Flash Attention
Authors: Rengan Xu, Junjie Yang, Yifan Xu, Hong Li, Xing Liu, Devashish Shankar, Haoci Zhang, Meng Liu, Boyang Li, Yuxi Hu, Mingwei Tang, Zehua Zhang, Tunhou Zhang, Dai Li, Sijia Chen, Gian-Paolo Musumeci, Jiaqi Zhai, Bill Zhu, Hong Yan, Srihari Reddy
Abstract: The integration of hardware accelerators has significantly advanced the capabilities of modern recommendation systems, enabling the exploration of complex ranking paradigms previously deemed impractical. However, the GPU-based computational costs present substantial challenges.
In this paper, we demonstrate our development of an efficiency-driven approach to explore these paradigms, moving beyond traditional reliance on native PyTorch modules. We address the specific challenges posed by ranking models' dependence on categorical features, which vary in length and complicate GPU utilization. We introduce Jagged Feature Interaction Kernels, a novel method designed to extract fine-grained insights from long categorical features through efficient handling of dynamically sized tensors. We further enhance the performance of attention mechanisms by integrating Jagged tensors with Flash Attention. Our novel Jagged Flash Attention achieves up to 9x speedup and 22x memory reduction compared to dense attention. Notably, it also outperforms dense flash attention, with up to 3x speedup and 53% more memory efficiency. In production models, we observe 10% QPS improvement and 18% memory savings, enabling us to scale our recommendation systems with longer features and more complex architectures.
Submitted 19 September, 2024; originally announced September 2024.
Comments: 3 pages, 2 figures
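The semantics of attention over jagged (variable-length) rows can be emulated with a padding mask: each batch row attends only to its valid keys. The dense sketch below shows only those semantics; Jagged Flash Attention fuses the masking into the kernel so padded positions are never materialized:

```python
import torch

def jagged_attention(q, k, v, lengths):
    """q: (B, Lq, D); k, v: (B, Lk, D); lengths: (B,) valid key counts >= 1.
    Each batch row attends only to its first lengths[b] keys."""
    scores = q @ k.transpose(-2, -1) / (q.shape[-1] ** 0.5)
    mask = torch.arange(k.shape[1])[None, :] >= lengths[:, None]   # (B, Lk)
    scores = scores.masked_fill(mask[:, None, :], float("-inf"))
    return torch.softmax(scores, dim=-1) @ v

out = jagged_attention(torch.randn(2, 3, 8), torch.randn(2, 5, 8),
                       torch.randn(2, 5, 8), torch.tensor([5, 2]))
```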
arXiv:2409.15169 (https://arxiv.org/abs/2409.15169) [pdf, other] cs.CR, cs.HC
Title: CamLoPA: A Hidden Wireless Camera Localization Framework via Signal Propagation Path Analysis
Authors: Xiang Zhang, Jie Zhang, Zehua Ma, Jinyang Huang, Meng Li, Huan Yan, Peng Zhao, Zijian Zhang, Qing Guo, Tianwei Zhang, Bin Liu, Nenghai Yu
Abstract: Hidden wireless cameras pose significant privacy threats, necessitating effective detection and localization methods. However, existing solutions often require spacious activity areas, expensive specialized devices, or pre-collected training data, limiting their practical deployment. To address these limitations, we introduce CamLoPA, a training-free wireless camera detection and localization framework that operates with minimal activity space constraints using low-cost commercial-off-the-shelf (COTS) devices. CamLoPA can achieve detection and localization in just 45 seconds of user activity with a Raspberry Pi board. During this short period, it analyzes the causal relationship between the wireless traffic and user movement to detect the presence of a snooping camera. Upon detection, CamLoPA employs a novel azimuth location model based on wireless signal propagation path analysis. Specifically, this model leverages the time ratio of user paths crossing the First Fresnel Zone (FFZ) to determine the azimuth angle of the camera. CamLoPA then refines the localization by identifying the camera's quadrant. We evaluate CamLoPA across various devices and environments, demonstrating that it achieves 95.37% snooping camera detection accuracy and an average localization error of 17.23 under significantly reduced activity space requirements. Our demo is available at https://www.youtube.com/watch?v=GKam04FzeM4.
Submitted 23 September, 2024; originally announced September 2024.
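The detection step rests on a simple intuition: a streaming camera's uplink traffic should co-vary with the user's movement in its field of view. The toy score below is a loudly simplified stand-in (CamLoPA's causal analysis is more careful about timing and confounders); it just correlates per-second bitrate with a 0/1 movement indicator:

```python
import numpy as np

def snooping_score(traffic_bps, moving):
    """Normalized correlation between per-second uplink traffic and a
    movement indicator; values near 1 suggest traffic tracks motion."""
    t = (traffic_bps - traffic_bps.mean()) / (traffic_bps.std() + 1e-9)
    m = (moving - moving.mean()) / (moving.std() + 1e-9)
    return float(np.mean(t * m))
```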
arXiv:2409.13285 (https://arxiv.org/abs/2409.13285) [pdf, other] eess.AS, cs.SD, eess.SP
Title: LiSenNet: Lightweight Sub-band and Dual-Path Modeling for Real-Time Speech Enhancement
Authors: Haoyin Yan, Jie Zhang, Cunhang Fan, Yeping Zhou, Peiqi Liu
Abstract: Speech enhancement (SE) aims to extract the clean waveform from noise-contaminated measurements to improve the speech quality and intelligibility. Although learning-based methods can perform much better than traditional counterparts, the large computational complexity and model size heavily limit the deployment on latency-sensitive and low-resource edge devices.
In this work, we propose a lightweight SE network (LiSenNet) for real-time applications. We design sub-band downsampling and upsampling blocks and a dual-path recurrent module to capture band-aware features and time-frequency patterns, respectively. A noise detector is developed to detect noisy regions in order to perform SE adaptively and save computational costs. Compared to recent higher-resource-dependent baseline models, the proposed LiSenNet can achieve a competitive performance with only 37k parameters (half of the state-of-the-art model) and 56M multiply-accumulate (MAC) operations per second.
Submitted 20 September, 2024; originally announced September 2024.
Comments: 5 pages, submitted to 2025 IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP 2025)
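The dual-path pattern named in this abstract alternates a recurrence along frequency within each frame with a recurrence along time within each band. A generic sketch of one such block (the sizes and residual connection are assumptions, not LiSenNet's exact module):

```python
import torch
import torch.nn as nn

class DualPathBlock(nn.Module):
    """One dual-path step: a GRU scans frequency within each frame,
    then another GRU scans time within each frequency band."""
    def __init__(self, d):
        super().__init__()
        self.freq_rnn = nn.GRU(d, d, batch_first=True)
        self.time_rnn = nn.GRU(d, d, batch_first=True)

    def forward(self, x):                      # x: (batch, time, freq, d)
        b, t, f, d = x.shape
        y, _ = self.freq_rnn(x.reshape(b * t, f, d))
        y = y.reshape(b, t, f, d).transpose(1, 2)          # (b, f, t, d)
        z, _ = self.time_rnn(y.reshape(b * f, t, d))
        return z.reshape(b, f, t, d).transpose(1, 2) + x   # residual

out = DualPathBlock(8)(torch.randn(1, 10, 16, 8))
```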
arXiv:2409.11854 (https://arxiv.org/abs/2409.11854) [pdf, ps, other] cs.RO, cs.CV
Title: Physically-Based Photometric Bundle Adjustment in Non-Lambertian Environments
Authors: Lei Cheng, Junpeng Hu, Haodong Yan, Mariia Gladkova, Tianyu Huang, Yun-Hui Liu, Daniel Cremers, Haoang Li
Abstract: Photometric bundle adjustment (PBA) is widely used in estimating the camera pose and 3D geometry by assuming a Lambertian world. However, the assumption of photometric consistency is often violated since the non-diffuse reflection is common in real-world environments. The photometric inconsistency significantly affects the reliability of existing PBA methods. To solve this problem, we propose a novel physically-based PBA method. Specifically, we introduce the physically-based weights regarding material, illumination, and light path. These weights distinguish the pixel pairs with different levels of photometric inconsistency. We also design corresponding models for material estimation based on sequential images and illumination estimation based on point clouds. In addition, we establish the first SLAM-related dataset of non-Lambertian scenes with complete ground truth of illumination and material. Extensive experiments demonstrate that our PBA method outperforms existing approaches in accuracy.
Submitted 18 September, 2024; originally announced September 2024.
Comments: Accepted to 2024 IEEE/RSJ International Conference on Intelligent Robots and Systems (IROS 2024)
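In a weighted photometric bundle adjustment, the per-pixel weights simply rescale the photometric residuals so that pairs judged inconsistent (specular material, unfavorable light path) contribute less to the objective. A minimal sketch of that weighted objective, with the weight model left as a placeholder for the estimators described above:

```python
import numpy as np

def weighted_photometric_error(I_ref, I_cur, uv_ref, uv_cur, w):
    """Sum of weighted squared intensity residuals between corresponding
    pixels; uv arrays are integer (N, 2) [x, y] coordinates and w holds
    the per-pixel physically-based weights."""
    r = (I_ref[uv_ref[:, 1], uv_ref[:, 0]]
         - I_cur[uv_cur[:, 1], uv_cur[:, 0]])
    return np.sum(w * r ** 2)   # minimized over pose and geometry in the BA
```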
In addition, we establish the first SLAM-related dataset of non-Lambertian scenes with complete ground truth of illumination and material. Extensive experiments demonstrate that our PBA method outperforms existing approaches in accuracy. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to 2024 IEEE/RSJ International Conference on Intelligent Robots and Systems (IROS 2024)</span> </p>
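<p class="is-size-7">As an illustration of the general mechanism (not the paper's actual material, illumination, and light-path models), the sketch below down-weights photometrically inconsistent pixels in a photometric cost; the residual-based weight heuristic is an assumption.</p> <pre class="is-size-7"><code class="language-python">
import numpy as np

def inconsistency_weights(I_ref, I_tgt, k=0.1):
    # Pixels with large photometric residuals are more likely non-Lambertian
    # (e.g. specular), so they receive smaller weights.
    r = np.abs(I_ref.astype(np.float64) - I_tgt.astype(np.float64))
    return 1.0 / (1.0 + (r / k) ** 2)

def weighted_photometric_cost(I_ref, I_tgt, w):
    """Weighted sum of squared intensity differences over corresponding pixels."""
    r = I_ref.astype(np.float64) - I_tgt.astype(np.float64)
    return np.sum(w * r ** 2)

I1 = np.random.rand(48, 64)
I2 = I1 + 0.05 * np.random.randn(48, 64)  # same view, with noisy outlier pixels
w = inconsistency_weights(I1, I2)
print(weighted_photometric_cost(I1, I2, w))
</code></pre>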
</li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.11018">arXiv:2409.11018</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2409.11018">pdf</a>, <a href="https://arxiv.org/format/2409.11018">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Unleashing the Potential of Mamba: Boosting a LiDAR 3D Sparse Detector by Using Cross-Model Knowledge Distillation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Yu%2C+R">Rui Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+R">Runkai Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+J">Jiagen Li</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+Q">Qingsong Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+S">Songhao Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Yan%2C+H">HuaiCheng Yan</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+M">Meng Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2409.11018v1-abstract-full"> The LiDAR-based 3D object detector that strikes a balance between accuracy and speed is crucial for achieving real-time perception in autonomous driving and robotic navigation systems. To enhance the accuracy of point cloud detection, integrating global context for visual understanding improves the point cloud's ability to grasp overall spatial information. However, many existing LiDAR detection models depend on intricate feature transformation and extraction processes, leading to poor real-time performance and high resource consumption, which limits their practical effectiveness. In this work, we propose a Faster LiDAR 3D object detection framework, called FASD, which implements heterogeneous model distillation by adaptively unifying cross-model voxel features. We aim to distill the transformer's capacity for high-performance sequence modeling into Mamba models with low FLOPs, achieving a significant improvement in accuracy through knowledge transfer. Specifically, Dynamic Voxel Group and Adaptive Attention strategies are integrated into the sparse backbone, creating a robust teacher model with scale-adaptive attention for effective global visual context modeling. Following feature alignment with the Adapter, we transfer knowledge from the Transformer to the Mamba through latent space feature supervision and span-head distillation, resulting in improved performance and an efficient student model. We evaluated the framework on the Waymo and nuScenes datasets, achieving a 4x reduction in resource consumption and a 1-2% performance improvement over the current SoTA methods. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p>
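<p class="is-size-7">A generic sketch of the cross-model feature-distillation flavor described above, assuming hypothetical teacher/student feature tensors; the Adapter here is a plain linear projection, not FASD's actual module.</p> <pre class="is-size-7"><code class="language-python">
import torch
import torch.nn as nn

class Adapter(nn.Module):
    """Projects student features into the teacher's latent space."""
    def __init__(self, d_student, d_teacher):
        super().__init__()
        self.proj = nn.Linear(d_student, d_teacher)

    def forward(self, x):
        return self.proj(x)

def distillation_loss(f_student, f_teacher, adapter, beta=1.0):
    # Latent-space feature supervision: match adapted student features
    # to (detached) teacher features.
    return beta * nn.functional.mse_loss(adapter(f_student), f_teacher.detach())

adapter = Adapter(d_student=128, d_teacher=256)
f_s = torch.randn(1024, 128)   # e.g. per-voxel student (Mamba) features
f_t = torch.randn(1024, 256)   # matching teacher (Transformer) features
loss = distillation_loss(f_s, f_t, adapter)
loss.backward()
print(float(loss))
</code></pre>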
href="/search/cs?searchtype=author&amp;query=Zhang%2C+Q">Qi Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Pan%2C+S">Shirui Pan</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+S">Senzhang Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.09253v1-abstract-short" style="display: inline;"> Owing to the unprecedented capability in semantic understanding and logical reasoning, the pre-trained large language models (LLMs) have shown fantastic potential in developing the next-generation recommender systems (RSs). However, the static index paradigm adopted by current methods greatly restricts the utilization of LLMs capacity for recommendation, leading to not only the insufficient alignm&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.09253v1-abstract-full').style.display = 'inline'; document.getElementById('2409.09253v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.09253v1-abstract-full" style="display: none;"> Owing to the unprecedented capability in semantic understanding and logical reasoning, the pre-trained large language models (LLMs) have shown fantastic potential in developing the next-generation recommender systems (RSs). However, the static index paradigm adopted by current methods greatly restricts the utilization of LLMs capacity for recommendation, leading to not only the insufficient alignment between semantic and collaborative knowledge, but also the neglect of high-order user-item interaction patterns. In this paper, we propose Twin-Tower Dynamic Semantic Recommender (TTDS), the first generative RS which adopts dynamic semantic index paradigm, targeting at resolving the above problems simultaneously. To be more specific, we for the first time contrive a dynamic knowledge fusion framework which integrates a twin-tower semantic token generator into the LLM-based recommender, hierarchically allocating meaningful semantic index for items and users, and accordingly predicting the semantic index of target item. Furthermore, a dual-modality variational auto-encoder is proposed to facilitate multi-grained alignment between semantic and collaborative knowledge. Eventually, a series of novel tuning tasks specially customized for capturing high-order user-item interaction patterns are proposed to take advantages of user historical behavior. Extensive experiments across three public datasets demonstrate the superiority of the proposed methodology in developing LLM-based generative RSs. The proposed TTDS recommender achieves an average improvement of 19.41% in Hit-Rate and 20.84% in NDCG metric, compared with the leading baseline methods. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.09253v1-abstract-full').style.display = 'none'; document.getElementById('2409.09253v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.05587">arXiv:2409.05587</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2409.05587">pdf</a>, <a href="https://arxiv.org/format/2409.05587">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> DSDFormer: An Innovative Transformer-Mamba Framework for Robust High-Precision Driver Distraction Identification </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Chen%2C+J">Junzhou Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Z">Zirui Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Yu%2C+J">Jing Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+H">Heqiang Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+R">Ronghui Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+X">Xuemiao Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Sheng%2C+B">Bin Sheng</a>, <a href="/search/cs?searchtype=author&amp;query=Yan%2C+H">Hong Yan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.05587v2-abstract-short" style="display: inline;"> Driver distraction remains a leading cause of traffic accidents, posing a critical threat to road safety globally. As intelligent transportation systems evolve, accurate and real-time identification of driver distraction has become essential. However, existing methods struggle to capture both global contextual and fine-grained local features while contending with noisy labels in training datasets.&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.05587v2-abstract-full').style.display = 'inline'; document.getElementById('2409.05587v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.05587v2-abstract-full" style="display: none;"> Driver distraction remains a leading cause of traffic accidents, posing a critical threat to road safety globally. As intelligent transportation systems evolve, accurate and real-time identification of driver distraction has become essential. However, existing methods struggle to capture both global contextual and fine-grained local features while contending with noisy labels in training datasets. To address these challenges, we propose DSDFormer, a novel framework that integrates the strengths of Transformer and Mamba architectures through a Dual State Domain Attention (DSDA) mechanism, enabling a balance between long-range dependencies and detailed feature extraction for robust driver behavior recognition. Additionally, we introduce Temporal Reasoning Confident Learning (TRCL), an unsupervised approach that refines noisy labels by leveraging spatiotemporal correlations in video sequences. Our model achieves state-of-the-art performance on the AUC-V1, AUC-V2, and 100-Driver datasets and demonstrates real-time processing efficiency on the NVIDIA Jetson AGX Orin platform. 
Extensive experimental results confirm that DSDFormer and TRCL significantly improve both the accuracy and robustness of driver distraction detection, offering a scalable solution to enhance road safety. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 9 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p>
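<p class="is-size-7">A small sketch of the confident-learning flavor of label refinement with a temporal prior: smooth per-frame class probabilities over a video window, then relabel frames whose smoothed prediction confidently disagrees with the given noisy label. The window and threshold are illustrative assumptions, not TRCL's actual procedure.</p> <pre class="is-size-7"><code class="language-python">
import numpy as np

def refine_labels(probs, labels, window=5, threshold=0.9):
    """probs: (T, C) per-frame class probabilities; labels: (T,) noisy labels."""
    T, C = probs.shape
    refined = labels.copy()
    half = window // 2
    for t in range(T):
        lo, hi = max(0, t - half), min(T, t + half + 1)
        smoothed = probs[lo:hi].mean(axis=0)    # temporal smoothing
        top = smoothed.argmax()
        if top != labels[t] and smoothed[top] >= threshold:
            refined[t] = top                    # confident disagreement: relabel
    return refined

rng = np.random.default_rng(0)
probs = rng.dirichlet(np.ones(3) * 0.2, size=100)   # peaky toy predictions
labels = rng.integers(0, 3, size=100)
print((refine_labels(probs, labels) != labels).sum(), "labels changed")
</code></pre>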
</li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.04390">arXiv:2409.04390</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2409.04390">pdf</a>, <a href="https://arxiv.org/format/2409.04390">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Future Does Matter: Boosting 3D Object Detection with Temporal Motion Estimation in Point Cloud Sequences </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Yu%2C+R">Rui Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+R">Runkai Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Nie%2C+C">Cong Nie</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+H">Heng Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Yan%2C+H">HuaiCheng Yan</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+M">Meng Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2409.04390v1-abstract-full"> Accurate and robust LiDAR 3D object detection is essential for comprehensive scene understanding in autonomous driving. Despite its importance, LiDAR detection performance is limited by inherent constraints of point cloud data, particularly under conditions of extended distances and occlusions. Recently, temporal aggregation has been proven to significantly enhance detection accuracy by fusing multi-frame viewpoint information and enriching the spatial representation of objects. In this work, we introduce a novel LiDAR 3D object detection framework, namely LiSTM, to facilitate spatial-temporal feature learning with cross-frame motion forecasting information. We aim to improve the spatial-temporal interpretation capabilities of the LiDAR detector by incorporating a dynamic prior, generated from a non-learnable motion estimation model. Specifically, Motion-Guided Feature Aggregation (MGFA) is proposed to utilize the object trajectory from previous and future motion states to model spatial-temporal correlations into a Gaussian heatmap over a driving sequence. This motion-based heatmap then guides the temporal feature fusion, enriching the proposed object features. Moreover, we design a Dual Correlation Weighting Module (DCWM) that effectively facilitates the interaction between past and prospective frames through scene- and channel-wise feature abstraction. Finally, a cascade cross-attention-based decoder is employed to refine the 3D prediction. We have conducted experiments on the Waymo and nuScenes datasets to demonstrate that the proposed framework achieves superior 3D detection performance with effective spatial-temporal feature learning. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p>
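<p class="is-size-7">For illustration, rasterizing object-center trajectories as a Gaussian heatmap on a bird's-eye-view grid, the usual way such motion priors are rendered; grid size and sigma are illustrative choices, not LiSTM's.</p> <pre class="is-size-7"><code class="language-python">
import numpy as np

def trajectory_heatmap(centers, shape=(128, 128), sigma=2.0):
    """centers: iterable of (row, col) object centers across past/future frames."""
    H, W = shape
    yy, xx = np.mgrid[0:H, 0:W]
    heat = np.zeros(shape)
    for r, c in centers:
        g = np.exp(-((yy - r) ** 2 + (xx - c) ** 2) / (2 * sigma ** 2))
        heat = np.maximum(heat, g)    # overlapping objects keep the peak value
    return heat

traj = [(40, 30), (42, 36), (44, 42)]  # one object moving across frames
print(trajectory_heatmap(traj).max())
</code></pre>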
</li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.03189">arXiv:2409.03189</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2409.03189">pdf</a>, <a href="https://arxiv.org/ps/2409.03189">ps</a>, <a href="https://arxiv.org/format/2409.03189">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> </div> </div> <p class="title is-5 mathjax"> A note on the differential spectrum of the Ness-Helleseth function </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Ren%2C+K">Ketong Ren</a>, <a href="/search/cs?searchtype=author&amp;query=Xiong%2C+M">Maosheng Xiong</a>, <a href="/search/cs?searchtype=author&amp;query=Yan%2C+H">Haode Yan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2409.03189v1-abstract-full"> Let $n\geqslant3$ be an odd integer and $u$ an element in the finite field $\gf_{3^n}$. The Ness-Helleseth function is the binomial $f_u(x)=ux^{d_1}+x^{d_2}$ over $\gf_{3^n}$, where $d_1=\frac{3^n-1}{2}-1$ and $d_2=3^n-2$. In 2007, Ness and Helleseth showed that $f_u$ is an APN function when $\chi(u+1)=\chi(u-1)=\chi(u)$, is differentially $3$-uniform when $\chi(u+1)=\chi(u-1)\neq\chi(u)$, and has differential uniformity at most 4 if $\chi(u+1)\neq\chi(u-1)$ and $u\notin\gf_3$. Here $\chi(\cdot)$ denotes the quadratic character on $\gf_{3^n}$. Recently, Xia et al. determined the differential uniformity of $f_u$ for all $u$ and computed the differential spectrum of $f_u$ for $u$ satisfying $\chi(u+1)=\chi(u-1)$ or $u\in\gf_3$. The remaining problem is the differential spectrum of $f_u$ with $\chi(u+1)\neq\chi(u-1)$ and $u\notin\gf_3$. In this paper, we fill in the gap. By studying the differential equations arising from the Ness-Helleseth function $f_u$ more carefully, we express the differential spectrum of $f_u$ for such $u$ in terms of two quadratic character sums. This complements the previous work of Xia et al. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p>
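<p class="is-size-7">A brute-force sketch of what a differential spectrum is for this binomial, assuming the third-party <code>galois</code> package (pip install galois) and only practical for small $n$; the theoretical character-sum expressions are the paper's contribution, not reproduced here.</p> <pre class="is-size-7"><code class="language-python">
import galois
from collections import Counter

n = 3                                   # small odd n for a quick check
q = 3 ** n
F = galois.GF(q)
d1, d2 = (q - 1) // 2 - 1, q - 2

def f(u, x):
    return u * x ** d1 + x ** d2        # x**(q-2) is inversion (and 0 at x=0)

def differential_spectrum(u):
    u = F(u)
    omega = Counter()                   # omega[i] = #{(a,b): delta(a,b) = i}
    for a in F.elements[1:]:            # nonzero input difference a
        row = Counter(int(f(u, x + a) - f(u, x)) for x in F.elements)
        omega.update(row.values())      # multiplicities of output differences b
    return sorted(omega.items())        # entries (i, omega_i) with i at least 1

print(differential_spectrum(2))
</code></pre>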
</li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.01893">arXiv:2409.01893</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2409.01893">pdf</a>, <a href="https://arxiv.org/format/2409.01893">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> What are the Essential Factors in Crafting Effective Long Context Multi-Hop Instruction Datasets? Insights and Best Practices </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Chen%2C+Z">Zhi Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+Q">Qiguang Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Qin%2C+L">Libo Qin</a>, <a href="/search/cs?searchtype=author&amp;query=Guo%2C+Q">Qipeng Guo</a>, <a href="/search/cs?searchtype=author&amp;query=Lv%2C+H">Haijun Lv</a>, <a href="/search/cs?searchtype=author&amp;query=Zou%2C+Y">Yicheng Zou</a>, <a href="/search/cs?searchtype=author&amp;query=Che%2C+W">Wanxiang Che</a>, <a href="/search/cs?searchtype=author&amp;query=Yan%2C+H">Hang Yan</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+K">Kai Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Lin%2C+D">Dahua Lin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2409.01893v1-abstract-full"> Recent advancements in large language models (LLMs) with extended context windows have significantly improved tasks such as information extraction, question answering, and complex planning scenarios. In order to achieve success in long-context tasks, a large amount of work has been done to enhance the long-context capabilities of models through synthetic data. Existing methods typically utilize the Self-Instruct framework to generate instruction-tuning data for improving long-context capability. However, our preliminary experiments indicate that less than 35% of generated samples are multi-hop, and more than 40% exhibit poor quality, limiting comprehensive understanding and further research. To improve the quality of synthetic data, we propose the Multi-agent Interactive Multi-hop Generation (MIMG) framework, incorporating a Quality Verification Agent, a Single-hop Question Generation Agent, a Multiple Question Sampling Strategy, and a Multi-hop Question Merger Agent. This framework improves data quality, with the proportion of high-quality, multi-hop, and diverse data exceeding 85%. Furthermore, we systematically investigate strategies for document selection, question merging, and validation techniques through extensive experiments across various models. Our findings show that our synthetic high-quality long-context instruction data significantly enhances model performance, even surpassing models trained on larger amounts of human-annotated data. Our code is available at: https://github.com/WowCZ/LongMIT.
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.01893v1-abstract-full').style.display = 'none'; document.getElementById('2409.01893v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Work in progress</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.14997">arXiv:2408.14997</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2408.14997">pdf</a>, <a href="https://arxiv.org/format/2408.14997">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Depth Restoration of Hand-Held Transparent Objects for Human-to-Robot Handover </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Yu%2C+R">Ran Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Yu%2C+H">Haixin Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+S">Shoujie Li</a>, <a href="/search/cs?searchtype=author&amp;query=Yan%2C+H">Huang Yan</a>, <a href="/search/cs?searchtype=author&amp;query=Song%2C+Z">Ziwu Song</a>, <a href="/search/cs?searchtype=author&amp;query=Ding%2C+W">Wenbo Ding</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.14997v2-abstract-short" style="display: inline;"> Transparent objects are common in daily life, while their optical properties pose challenges for RGB-D cameras to capture accurate depth information. This issue is further amplified when these objects are hand-held, as hand occlusions further complicate depth estimation. For assistant robots, however, accurately perceiving hand-held transparent objects is critical to effective human-robot interact&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.14997v2-abstract-full').style.display = 'inline'; document.getElementById('2408.14997v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.14997v2-abstract-full" style="display: none;"> Transparent objects are common in daily life, while their optical properties pose challenges for RGB-D cameras to capture accurate depth information. This issue is further amplified when these objects are hand-held, as hand occlusions further complicate depth estimation. For assistant robots, however, accurately perceiving hand-held transparent objects is critical to effective human-robot interaction. This paper presents a Hand-Aware Depth Restoration (HADR) method based on creating an implicit neural representation function from a single RGB-D image. The proposed method utilizes hand posture as an important guidance to leverage semantic and geometric information of hand-object interaction. 
</li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.14997">arXiv:2408.14997</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2408.14997">pdf</a>, <a href="https://arxiv.org/format/2408.14997">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Depth Restoration of Hand-Held Transparent Objects for Human-to-Robot Handover </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Yu%2C+R">Ran Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Yu%2C+H">Haixin Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+S">Shoujie Li</a>, <a href="/search/cs?searchtype=author&amp;query=Yan%2C+H">Huang Yan</a>, <a href="/search/cs?searchtype=author&amp;query=Song%2C+Z">Ziwu Song</a>, <a href="/search/cs?searchtype=author&amp;query=Ding%2C+W">Wenbo Ding</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2408.14997v2-abstract-full"> Transparent objects are common in daily life, while their optical properties pose challenges for RGB-D cameras to capture accurate depth information. This issue is further amplified when these objects are hand-held, as hand occlusions further complicate depth estimation. For assistant robots, however, accurately perceiving hand-held transparent objects is critical to effective human-robot interaction. This paper presents a Hand-Aware Depth Restoration (HADR) method based on creating an implicit neural representation function from a single RGB-D image. The proposed method utilizes hand posture as important guidance to leverage semantic and geometric information of hand-object interaction. To train and evaluate the proposed method, we create a high-fidelity synthetic dataset named TransHand-14K with a real-to-sim data generation scheme. Experiments show that our method has better performance and generalization ability compared with existing methods. We further develop a real-world human-to-robot handover system based on HADR, demonstrating its potential in human-robot interaction applications. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 27 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">7 pages, 7 figures, conference</span> </p>
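<p class="is-size-7">A minimal sketch of the implicit-neural-representation idea: an MLP maps a pixel coordinate plus a conditioning code (standing in for hand-pose and image features) to a restored depth value. Architecture sizes are illustrative assumptions, not HADR's.</p> <pre class="is-size-7"><code class="language-python">
import torch
import torch.nn as nn

class DepthINR(nn.Module):
    def __init__(self, cond_dim=32, hidden=64):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(2 + cond_dim, hidden), nn.ReLU(),
            nn.Linear(hidden, hidden), nn.ReLU(),
            nn.Linear(hidden, 1),             # depth at the queried pixel
        )

    def forward(self, xy, cond):
        # xy: (N, 2) normalized pixel coords; cond: (N, cond_dim)
        return self.net(torch.cat([xy, cond], dim=-1)).squeeze(-1)

model = DepthINR()
xy = torch.rand(4096, 2)                      # query pixels independently
cond = torch.randn(4096, 32)                  # e.g. hand-pose-aware features
print(model(xy, cond).shape)
</code></pre>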
</li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.09157">arXiv:2408.09157</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2408.09157">pdf</a>, <a href="https://arxiv.org/format/2408.09157">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> On the KL-Divergence-based Robust Satisficing Model </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Yan%2C+H">Haojie Yan</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+M">Minglong Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Guo%2C+J">Jiayi Guo</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2408.09157v1-abstract-full"> Empirical risk minimization, a cornerstone in machine learning, is often hindered by the Optimizer's Curse stemming from discrepancies between the empirical and true data-generating distributions. To address this challenge, the robust satisficing framework has emerged recently to mitigate ambiguity in the true distribution. Distinguished by its interpretable hyperparameter and enhanced performance guarantees, this approach has attracted increasing attention from academia. However, its applicability in tackling general machine learning problems, notably deep neural networks, remains largely unexplored due to the computational challenges in solving this model efficiently across general loss functions. In this study, we delve into the Kullback-Leibler (KL) divergence-based robust satisficing model under a general loss function, presenting analytical interpretations, diverse performance guarantees, efficient and stable numerical methods, convergence analysis, and an extension tailored for hierarchical data structures. Through extensive numerical experiments across three distinct machine learning tasks, we demonstrate the superior performance of our model compared to state-of-the-art benchmarks. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p>
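<p class="is-size-7">A sketch of how a KL robust satisficing certificate can be checked numerically, using the standard Donsker-Varadhan identity sup over P of {E_P[loss] - k*KL(P||P_hat)} = k*log E_{P_hat}[exp(loss/k)], and bisecting for the smallest feasible k given a target tau. This is generic distributionally robust optimization math on toy data, not the paper's algorithm.</p> <pre class="is-size-7"><code class="language-python">
import numpy as np

def worst_case_value(losses, k):
    # k * log E[exp(loss / k)] under the empirical distribution;
    # nonincreasing in k, tending to max(loss) as k -> 0 and mean(loss) as k grows.
    return k * np.log(np.mean(np.exp(losses / k)))

def smallest_feasible_k(losses, tau, k_hi=1e6, iters=80):
    if worst_case_value(losses, k_hi) > tau:
        return None                      # tau is below attainable performance
    k_lo = 1e-9
    for _ in range(iters):
        k_mid = 0.5 * (k_lo + k_hi)
        if worst_case_value(losses, k_mid) > tau:
            k_lo = k_mid                 # infeasible: need a larger k
        else:
            k_hi = k_mid                 # feasible: try to shrink k
    return k_hi

rng = np.random.default_rng(0)
losses = rng.normal(1.0, 0.3, size=1000)
print(smallest_feasible_k(losses, tau=1.2))
</code></pre>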
</li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.03152">arXiv:2408.03152</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2408.03152">pdf</a>, <a href="https://arxiv.org/format/2408.03152">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> TSC: A Simple Two-Sided Constraint against Over-Smoothing </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Peng%2C+F">Furong Peng</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+K">Kang Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Lu%2C+X">Xuan Lu</a>, <a href="/search/cs?searchtype=author&amp;query=Qian%2C+Y">Yuhua Qian</a>, <a href="/search/cs?searchtype=author&amp;query=Yan%2C+H">Hongren Yan</a>, <a href="/search/cs?searchtype=author&amp;query=Ma%2C+C">Chao Ma</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2408.03152v1-abstract-full"> Graph Convolutional Neural Network (GCN), a widely adopted method for analyzing relational data, enhances node discriminability through the aggregation of neighboring information. Usually, stacking multiple layers can improve the performance of GCN by leveraging information from high-order neighbors. However, increasing the network depth induces the over-smoothing problem, which can be attributed to changes in the quality and quantity of neighbors: (a) neighbor quality: a node's high-order neighbors become overlapping, so the aggregated information becomes indistinguishable; (b) neighbor quantity: the exponentially growing number of aggregated neighbors submerges the node's initial features under recursive aggregation operations. Current solutions mainly focus on one of the above causes and seldom consider both at once. Aiming at tackling both causes of over-smoothing in one shot, we introduce a simple Two-Sided Constraint (TSC) for GCNs, comprising two straightforward yet potent techniques: random masking and a contrastive constraint. The random masking acts on the representation matrix's columns to regulate the degree of information aggregation from neighbors, thus preventing the convergence of node representations. Meanwhile, the contrastive constraint, applied to the representation matrix's rows, enhances the discriminability of the nodes. Designed as a plug-in module, TSC can be easily coupled with GCN or SGC architectures. Experimental analyses on diverse real-world graph datasets verify that our approach markedly reduces the convergence of node representations and the performance degradation in deeper GCNs. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">accept by KDD2024</span> </p>
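<p class="is-size-7">A sketch of the two constraints' flavor under stated assumptions: random masking of representation-matrix columns (limits how much aggregated neighbor information survives) plus a simple row-wise penalty that pushes node representations apart. Both are simplified stand-ins, not TSC's actual operators.</p> <pre class="is-size-7"><code class="language-python">
import torch

def random_column_mask(H, keep_prob=0.7):
    """Zero out whole feature columns of the node representation matrix H."""
    mask = (torch.rand(H.shape[1]) + keep_prob).floor()   # 1 w.p. keep_prob
    return H * mask, mask

def row_separation_penalty(H):
    # Contrastive-style constraint on rows: penalize high cosine similarity
    # between distinct node representations to keep nodes discriminable.
    Z = torch.nn.functional.normalize(H, dim=1)
    S = Z @ Z.T
    off_diag = S - torch.diag(torch.diag(S))
    return off_diag.pow(2).mean()

H = torch.randn(100, 64)          # 100 nodes, 64-dim representations
H_masked, _ = random_column_mask(H)
print(float(row_separation_penalty(H_masked)))
</code></pre>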
</li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.03084">arXiv:2408.03084</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2408.03084">pdf</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Research on Autonomous Driving Decision-making Strategies based Deep Reinforcement Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Z">Zixiang Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Yan%2C+H">Hao Yan</a>, <a href="/search/cs?searchtype=author&amp;query=Wei%2C+C">Changsong Wei</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+J">Junyu Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Bo%2C+S">Shi Bo</a>, <a href="/search/cs?searchtype=author&amp;query=Xiao%2C+M">Minheng Xiao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2408.03084v1-abstract-full"> The behavior decision-making subsystem is a key component of the autonomous driving system, which reflects the decision-making ability of the vehicle and the driver, and is an important symbol of the high-level intelligence of the vehicle. However, the existing rule-based decision-making schemes are limited by the prior knowledge of designers, and it is difficult to cope with complex and changeable traffic scenarios. In this work, an advanced deep reinforcement learning model is adopted, which can autonomously learn and optimize driving strategies in a complex and changeable traffic environment by modeling the driving decision-making process as a reinforcement learning problem. Specifically, we used Deep Q-Network (DQN) and Proximal Policy Optimization (PPO) for comparative experiments. DQN guides the agent to choose the best action by approximating the state-action value function, while PPO improves the decision-making quality by optimizing the policy function. We also introduce improvements in the design of the reward function to promote the robustness and adaptability of the model in real-world driving situations. Experimental results show that the decision-making strategy based on deep reinforcement learning has better performance than the traditional rule-based method in a variety of driving tasks. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p>
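<p class="is-size-7">For illustration, the two textbook update rules the abstract compares: the DQN bootstrap target and the PPO clipped surrogate objective. This is standard RL math on toy tensors, not the paper's code.</p> <pre class="is-size-7"><code class="language-python">
import torch

def dqn_targets(q_next, rewards, dones, gamma=0.99):
    # y = r + gamma * max_a' Q(s', a') for non-terminal transitions
    return rewards + gamma * (1.0 - dones) * q_next.max(dim=1).values

def ppo_clipped_objective(ratio, advantage, eps=0.2):
    # min(ratio * A, clip(ratio, 1-eps, 1+eps) * A), averaged over the batch
    unclipped = ratio * advantage
    clipped = torch.clamp(ratio, 1.0 - eps, 1.0 + eps) * advantage
    return torch.minimum(unclipped, clipped).mean()

q_next = torch.randn(32, 4)        # Q-values for 4 candidate driving actions
rewards = torch.randn(32)
dones = torch.zeros(32)
print(dqn_targets(q_next, rewards, dones).shape)
print(float(ppo_clipped_objective(torch.rand(32) + 0.5, torch.randn(32))))
</code></pre>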
</li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.02455">arXiv:2408.02455</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2408.02455">pdf</a>, <a href="https://arxiv.org/format/2408.02455">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> A Surprisingly Efficient Representation for Multi-Finger Grasping </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Yan%2C+H">Hengxu Yan</a>, <a href="/search/cs?searchtype=author&amp;query=Fang%2C+H">Hao-Shu Fang</a>, <a href="/search/cs?searchtype=author&amp;query=Lu%2C+C">Cewu Lu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2408.02455v1-abstract-full"> The problem of grasping objects using a multi-finger hand has received significant attention in recent years. However, it remains challenging to handle a large number of unfamiliar objects in real and cluttered environments. In this work, we propose a representation that can be effectively mapped to the multi-finger grasp space. Based on this representation, we develop a simple decision model that generates accurate grasp quality scores for different multi-finger grasp poses using only hundreds to thousands of training samples. We demonstrate that our representation performs well on a real robot and achieves a success rate of 78.64% after training with only 500 real-world grasp attempts and 87% with 4500 grasp attempts. Additionally, we achieve a success rate of 84.51% in a dynamic human-robot handover scenario using a multi-finger hand. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Published at International Conference on Robotics and Automation (ICRA) 2024</span> </p>
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Published at International Conference on Robotics and Automation (ICRA) 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.01779">arXiv:2408.01779</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2408.01779">pdf</a>, <a href="https://arxiv.org/format/2408.01779">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> MathLearner: A Large Language Model Agent Framework for Learning to Solve Mathematical Problems </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Xie%2C+W">Wenbei Xie</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+D">Donglin Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Yan%2C+H">Haoran Yan</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+W">Wenjie Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Z">Zongyang Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.01779v1-abstract-short" style="display: inline;"> With the development of artificial intelligence (AI), large language models (LLM) are widely used in many fields. However, the reasoning ability of LLM is still very limited when it comes to mathematical reasoning. Mathematics plays an important role in all aspects of human society and is a technical guarantee in the fields of healthcare, transport and aerospace, for this reason, the development o&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.01779v1-abstract-full').style.display = 'inline'; document.getElementById('2408.01779v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.01779v1-abstract-full" style="display: none;"> With the development of artificial intelligence (AI), large language models (LLM) are widely used in many fields. However, the reasoning ability of LLM is still very limited when it comes to mathematical reasoning. Mathematics plays an important role in all aspects of human society and is a technical guarantee in the fields of healthcare, transport and aerospace, for this reason, the development of AI big language models in the field of mathematics has great potential significance. To improve the mathematical reasoning ability of large language models, we proposed an agent framework for learning to solve mathematical problems based on inductive reasoning. By emulating the human learning process of generalization of learned information and effective application of previous knowledge in new reasoning tasks, this framework has great performance in the mathematical reasoning process. It improves global accuracy over the baseline method (chain-of-thought) by 20.96% and solves 17.54% of the mathematical problems that the baseline cannot solve. Benefiting from the efficient RETRIEVAL method, our model improves the ability of large language models to efficiently use external knowledge, i.e., the mathematical computation of the model can be based on written procedures. 
In education, our model can be used as a personalised learning aid, thus reducing the inequality of educational resources. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.21323">arXiv:2407.21323</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2407.21323">pdf</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> STANet: A Novel Spatio-Temporal Aggregation Network for Depression Classification with Small and Unbalanced FMRI Data </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+W">Wei Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Zeng%2C+W">Weiming Zeng</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+H">Hongyu Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+J">Jie Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Yan%2C+H">Hongjie Yan</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+K">Kaile Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Tao%2C+R">Ran Tao</a>, <a href="/search/cs?searchtype=author&amp;query=Siok%2C+W+T">Wai Ting Siok</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+N">Nizhuan Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2407.21323v1-abstract-full"> Accurate diagnosis of depression is crucial for timely implementation of optimal treatments, preventing complications and reducing the risk of suicide. Traditional methods rely on self-report questionnaires and clinical assessment, lacking objective biomarkers. Combining fMRI with artificial intelligence can enhance depression diagnosis by integrating neuroimaging indicators.
However, the specificity of fMRI acquisition for depression often results in unbalanced and small datasets, challenging the sensitivity and accuracy of classification models. In this study, we propose the Spatio-Temporal Aggregation Network (STANet) for diagnosing depression by integrating CNN and RNN to capture both temporal and spatial features of brain activity. STANet comprises the following steps: (1) Aggregate spatio-temporal information via ICA. (2) Utilize multi-scale deep convolution to capture detailed features. (3) Balance data using SMOTE to generate new samples for minority classes. (4) Employ the AFGRU classifier, which combines Fourier transformation with GRU, to capture long-term dependencies, with an adaptive weight assignment mechanism to enhance model generalization. The experimental results demonstrate that STANet achieves superior depression diagnostic performance with 82.38% accuracy and a 90.72% AUC. The STFA module enhances classification by capturing deeper features at multiple scales. The AFGRU classifier, with adaptive weights and stacked GRU, attains higher accuracy and AUC. SMOTE outperforms other oversampling methods. Additionally, spatio-temporal aggregated features achieve better performance compared to using only temporal or spatial features. STANet outperforms traditional or deep learning classifiers, and functional connectivity-based classifiers, as demonstrated by ten-fold cross-validation. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p>
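<p class="is-size-7">A sketch of two of the listed steps under stated assumptions: SMOTE-style oversampling (via the third-party imbalanced-learn package) feeding a GRU classifier over ICA time courses. All shapes and sizes are illustrative, not STANet's configuration.</p> <pre class="is-size-7"><code class="language-python">
import numpy as np
import torch
import torch.nn as nn
from imblearn.over_sampling import SMOTE   # assumes imbalanced-learn installed

T, C = 100, 10                             # time points x ICA components
X = np.random.randn(60, T * C)             # 60 subjects, flattened series
y = np.array([0] * 50 + [1] * 10)          # unbalanced labels
X_bal, y_bal = SMOTE(random_state=0).fit_resample(X, y)   # step (3)

class GRUClassifier(nn.Module):            # simplified stand-in for step (4)
    def __init__(self, in_dim=C, hidden=32, classes=2):
        super().__init__()
        self.gru = nn.GRU(in_dim, hidden, batch_first=True)
        self.head = nn.Linear(hidden, classes)

    def forward(self, x):                  # x: (batch, T, C)
        _, h = self.gru(x)
        return self.head(h[-1])

model = GRUClassifier()
batch = torch.tensor(X_bal, dtype=torch.float32).reshape(-1, T, C)
print(model(batch).shape)                  # (balanced subjects, 2)
</code></pre>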
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.20018">arXiv:2407.20018</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2407.20018">pdf</a>, <a href="https://arxiv.org/format/2407.20018">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> </div> </div> <p class="title is-5 mathjax"> Efficient Training of Large Language Models on Distributed Infrastructures: A Survey </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Duan%2C+J">Jiangfei Duan</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+S">Shuo Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Z">Zerui Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Jiang%2C+L">Lijuan Jiang</a>, <a href="/search/cs?searchtype=author&amp;query=Qu%2C+W">Wenwen Qu</a>, <a href="/search/cs?searchtype=author&amp;query=Hu%2C+Q">Qinghao Hu</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+G">Guoteng Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Weng%2C+Q">Qizhen Weng</a>, <a href="/search/cs?searchtype=author&amp;query=Yan%2C+H">Hang Yan</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+X">Xingcheng Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Qiu%2C+X">Xipeng Qiu</a>, <a href="/search/cs?searchtype=author&amp;query=Lin%2C+D">Dahua Lin</a>, <a href="/search/cs?searchtype=author&amp;query=Wen%2C+Y">Yonggang Wen</a>, <a href="/search/cs?searchtype=author&amp;query=Jin%2C+X">Xin Jin</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+T">Tianwei Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+P">Peng Sun</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.20018v1-abstract-short" style="display: inline;"> Large Language Models (LLMs) like GPT and LLaMA are revolutionizing the AI industry with their sophisticated capabilities. Training these models requires vast GPU clusters and significant computing time, posing major challenges in terms of scalability, efficiency, and reliability. This survey explores recent advancements in training systems for LLMs, including innovations in training infrastructur&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.20018v1-abstract-full').style.display = 'inline'; document.getElementById('2407.20018v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.20018v1-abstract-full" style="display: none;"> Large Language Models (LLMs) like GPT and LLaMA are revolutionizing the AI industry with their sophisticated capabilities. Training these models requires vast GPU clusters and significant computing time, posing major challenges in terms of scalability, efficiency, and reliability. This survey explores recent advancements in training systems for LLMs, including innovations in training infrastructure with AI accelerators, networking, storage, and scheduling. Additionally, the survey covers parallelism strategies, as well as optimizations for computation, communication, and memory in distributed LLM training. 
It also includes approaches of maintaining system reliability over extended training periods. By examining current innovations and future directions, this survey aims to provide valuable insights towards improving LLM training systems and tackling ongoing challenges. Furthermore, traditional digital circuit-based computing systems face significant constraints in meeting the computational demands of LLMs, highlighting the need for innovative solutions such as optical computing and optical networks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.20018v1-abstract-full').style.display = 'none'; document.getElementById('2407.20018v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.18492">arXiv:2407.18492</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2407.18492">pdf</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Neural Modulation Alteration to Positive and Negative Emotions in Depressed Patients: Insights from fMRI Using Positive/Negative Emotion Atlas </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Feng%2C+Y">Yu Feng</a>, <a href="/search/cs?searchtype=author&amp;query=Zeng%2C+W">Weiming Zeng</a>, <a href="/search/cs?searchtype=author&amp;query=Xie%2C+Y">Yifan Xie</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+H">Hongyu Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+L">Lei Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Y">Yingying Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Yan%2C+H">Hongjie Yan</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+K">Kaile Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Tao%2C+R">Ran Tao</a>, <a href="/search/cs?searchtype=author&amp;query=Siok%2C+W+T">Wai Ting Siok</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+N">Nizhuan Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.18492v1-abstract-short" style="display: inline;"> Background: Although it has been noticed that depressed patients show differences in processing emotions, the precise neural modulation mechanisms of positive and negative emotions remain elusive. 
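Among the parallelism strategies such a survey covers, the most basic is data parallelism: each replica computes gradients on its shard and the gradients are averaged (an all-reduce) before an identical update. The sketch below simulates two replicas in a single process; it is a toy illustration under assumed shapes, not any system described in the survey.

```python
# Single-process simulation of data-parallel training: per-shard gradients,
# an averaging step standing in for all-reduce, and one shared update.
import torch

torch.manual_seed(0)
w = torch.randn(4, requires_grad=True)      # shared parameters
x, y = torch.randn(8, 4), torch.randn(8)    # global batch
shards = [(x[:4], y[:4]), (x[4:], y[4:])]   # one shard per "device"

grads = []
for xs, ys in shards:                       # would run in parallel on devices
    loss = ((xs @ w - ys) ** 2).mean()
    grads.append(torch.autograd.grad(loss, w)[0])

avg_grad = torch.stack(grads).mean(dim=0)   # all-reduce (mean) of gradients
with torch.no_grad():
    w -= 0.1 * avg_grad                     # identical update on every replica
print(w)
```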
arXiv:2407.18492 [pdf] cs.CV (Computer Vision and Pattern Recognition)
https://arxiv.org/abs/2407.18492
Neural Modulation Alteration to Positive and Negative Emotions in Depressed Patients: Insights from fMRI Using Positive/Negative Emotion Atlas
Authors: Yu Feng, Weiming Zeng, Yifan Xie, Hongyu Chen, Lei Wang, Yingying Wang, Hongjie Yan, Kaile Zhang, Ran Tao, Wai Ting Siok, Nizhuan Wang
Abstract: Background: Although it has been noticed that depressed patients show differences in processing emotions, the precise neural modulation mechanisms of positive and negative emotions remain elusive. fMRI is a cutting-edge medical imaging technology renowned for its high spatial resolution and dynamic temporal information, making it particularly suitable for studying the neural dynamics of depression. Methods: To address this gap, our study first leveraged fMRI to delineate activated regions associated with positive and negative emotions in healthy individuals, resulting in the creation of a positive emotion atlas (PEA) and a negative emotion atlas (NEA). Subsequently, we examined neuroimaging changes in depressed patients using these atlases and evaluated their diagnostic performance with machine learning. Results: Our findings demonstrate that the classification accuracy for depressed patients based on the PEA and NEA exceeded 0.70, a notable improvement over whole-brain atlases. Furthermore, ALFF analysis unveiled significant differences between depressed patients and healthy controls in eight functional clusters for the NEA, centered on the left cuneus, cingulate gyrus, and superior parietal lobule. In contrast, the PEA revealed more pronounced differences across fifteen clusters, involving the right fusiform gyrus, parahippocampal gyrus, and inferior parietal lobule. Limitations: Due to the limited sample size and subtypes of depressed patients, the efficacy may need further validation in the future. Conclusions: These findings emphasize the complex interplay between emotion modulation and depression, showcasing significant alterations in both the PEA and NEA among depressed patients. This research enhances our understanding of emotion modulation in depression, with implications for diagnosis and treatment evaluation.
Submitted 25 July, 2024; originally announced July 2024.
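The "machine learning" evaluation step could look like the generic sketch below: a cross-validated linear classifier over per-cluster features (for instance, mean ALFF within each atlas cluster). All data here are synthetic, and the paper's actual classifier and feature definitions are not reproduced.

```python
# Hypothetical diagnostic-performance check: ten-fold CV accuracy of a
# linear SVM on synthetic per-cluster features (patients vs. controls).
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

rng = np.random.default_rng(0)
n_clusters = 15                                 # e.g., clusters of the PEA
X = rng.normal(size=(60, n_clusters))           # 60 synthetic subjects
y = np.repeat([0, 1], 30)                       # controls / patients
X[y == 1] += 0.5                                # inject a weak group effect

clf = make_pipeline(StandardScaler(), SVC(kernel="linear"))
acc = cross_val_score(clf, X, y, cv=10).mean()
print(f"mean accuracy: {acc:.2f}")
```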
arXiv:2407.15176 [pdf, other] cs.CL (Computation and Language); cs.AI (Artificial Intelligence)
https://arxiv.org/abs/2407.15176
ReAttention: Training-Free Infinite Context with Finite Attention Scope
Authors: Xiaoran Liu, Ruixiao Li, Qipeng Guo, Zhigeng Liu, Yuerong Song, Kai Lv, Hang Yan, Linlin Li, Qun Liu, Xipeng Qiu
Abstract: The long-context capability of Large Language Models (LLMs) has made significant breakthroughs, but the maximum supported context length remains a critical bottleneck limiting their practical applications. The constraint arises from the self-attention mechanism, which cannot effectively and efficiently capture semantic relationships within infinitely long contexts via its limited pre-trained positional information and attention scope. In this work, we propose ReAttention, a training-free approach enabling LLMs based on the self-attention mechanism to support an infinite context with a finite attention scope, given sufficient memory resources. ReAttention performs position-agnostic top-$k$ attention before the ordinary position-aware self-attention, freeing LLMs from the length-extrapolation issue. We validate ReAttention on LongBench, L-Eval, and InfiniteBench and demonstrate that it is on par with traditional methods. Furthermore, we apply ReAttention to mainstream LLMs, including LLaMA3.1-8B and Mistral-v0.3-7B, enabling them to support context lengths of at least 1M tokens and even expanding the context length of LLaMA3.2-3B-chat by 128$\times$ to 4M in Needle-In-A-Haystack tests, without any further training. We also improve the efficiency of ReAttention with Triton, achieving extrapolation without additional overhead.
Submitted 4 October, 2024; v1 submitted 21 July, 2024; originally announced July 2024.
Comments: 18 pages, 12 figures.
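A rough single-head sketch of the two-stage idea the abstract describes: a position-agnostic top-$k$ pass selects the most relevant keys from the full, arbitrarily long cache, and ordinary softmax attention is then applied only within that finite scope. Tensor shapes and the handling of positions are illustrative assumptions, not the paper's implementation.

```python
import torch
import torch.nn.functional as F

def reattention_step(q, K, V, k=256):
    """q: (d,); K, V: (n, d) with n possibly much larger than k."""
    # Stage 1: position-agnostic top-k -- plain dot-product scores,
    # no positional encoding, over the entire cached context.
    scores = K @ q                                # (n,)
    idx = torch.topk(scores, min(k, K.size(0))).indices
    idx, _ = torch.sort(idx)                      # keep original temporal order
    K_sel, V_sel = K[idx], V[idx]                 # finite attention scope

    # Stage 2: ordinary scaled dot-product attention within the scope.
    # (A real model would assign positional information to these k slots.)
    attn = F.softmax(K_sel @ q / q.size(0) ** 0.5, dim=0)
    return attn @ V_sel                           # (d,)

q = torch.randn(64)
K, V = torch.randn(100_000, 64), torch.randn(100_000, 64)  # long cache
print(reattention_step(q, K, V).shape)            # torch.Size([64])
```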
arXiv:2407.14850 [pdf, other] cs.CV (Computer Vision and Pattern Recognition)
https://arxiv.org/abs/2407.14850
A Tale of Single-channel Electroencephalogram: Devices, Datasets, Signal Processing, Applications, and Future Directions
Authors: Yueyang Li, Weiming Zeng, Wenhao Dong, Di Han, Lei Chen, Hongyu Chen, Hongjie Yan, Wai Ting Siok, Nizhuan Wang
Abstract: Single-channel electroencephalogram (EEG) is a cost-effective, comfortable, and non-invasive method for monitoring brain activity, widely adopted by researchers, consumers, and clinicians. The increasing number and proportion of articles on single-channel EEG underscore its growing potential. This paper provides a comprehensive review of single-channel EEG, focusing on development trends, devices, datasets, signal processing methods, recent applications, and future directions. Definitions of bipolar and unipolar configurations in single-channel EEG are clarified to guide future advancements. Applications mainly span sleep staging, emotion recognition, educational research, and clinical diagnosis. Ongoing advances in AI-based EEG generation techniques suggest that single-channel EEG may reach parity with, or surpass, multichannel EEG performance.
Submitted 20 July, 2024; originally announced July 2024.
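Among the signal processing methods a review like this covers, band-pass filtering is the canonical first step before tasks such as sleep staging or emotion recognition. Below is a minimal, generic sketch; the sampling rate, band edges, and synthetic signal are assumptions, not values from the paper.

```python
# Band-pass a synthetic single-channel EEG trace to the 1-40 Hz range.
import numpy as np
from scipy.signal import butter, filtfilt

fs = 256                                          # Hz, assumed sampling rate
t = np.arange(0, 10, 1 / fs)
eeg = np.sin(2 * np.pi * 10 * t) \
    + 0.5 * np.random.default_rng(0).normal(size=t.size)

b, a = butter(4, [1, 40], btype="bandpass", fs=fs)
filtered = filtfilt(b, a, eeg)                    # zero-phase filtering
print(filtered.shape)                             # (2560,)
```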
arXiv:2407.13211 [pdf] cs.CV (Computer Vision and Pattern Recognition); eess.IV (Image and Video Processing)
https://arxiv.org/abs/2407.13211
Research on Image Super-Resolution Reconstruction Mechanism based on Convolutional Neural Network
Authors: Hao Yan, Zixiang Wang, Zhengjia Xu, Zhuoyue Wang, Zhizhong Wu, Ranran Lyu
Abstract: Super-resolution reconstruction techniques use software algorithms to transform one or more sets of low-resolution images of the same scene into high-resolution images. In recent years, considerable advances have been made in single-image super-resolution, particularly with deep learning. Nevertheless, feature extraction and nonlinear mapping in the reconstruction process remain challenging for existing algorithms: network architectures often fail to exploit the diverse information available at different levels, high-frequency details are lost, and the reconstructed images are overly smooth and lack fine texture, degrading subjective visual quality. The objective is to recover high-quality, high-resolution images from low-resolution inputs. In this work, an enhanced deep convolutional neural network is employed, comprising multiple convolutional layers, each configured with specific filters and activation functions to capture the diverse features of the image. A residual learning strategy accelerates training and improves convergence, while sub-pixel convolutional layers refine the high-frequency details and textures of the image. Experimental analysis demonstrates superior performance on multiple public datasets compared with traditional bicubic interpolation and several other learning-based super-resolution methods, and confirms the model's efficacy in preserving image edges and textures.
Submitted 31 July, 2024; v1 submitted 18 July, 2024; originally announced July 2024.
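The named ingredients (stacked convolutions, residual learning, sub-pixel convolution) fit in a compact PyTorch module; the sketch below assumes a x4 scale and small layer sizes and is not the paper's exact architecture.

```python
# Minimal SR network: conv feature extraction, a residual connection, and
# sub-pixel (PixelShuffle) upsampling. Sizes and scale are assumptions.
import torch
import torch.nn as nn

class SRNet(nn.Module):
    def __init__(self, channels=64, scale=4):
        super().__init__()
        self.head = nn.Conv2d(3, channels, 3, padding=1)
        self.body = nn.Sequential(                 # feature extraction
            nn.Conv2d(channels, channels, 3, padding=1), nn.ReLU(),
            nn.Conv2d(channels, channels, 3, padding=1), nn.ReLU(),
        )
        self.upsample = nn.Sequential(             # sub-pixel convolution
            nn.Conv2d(channels, 3 * scale ** 2, 3, padding=1),
            nn.PixelShuffle(scale),
        )

    def forward(self, x):
        feat = self.head(x)
        feat = feat + self.body(feat)              # residual learning
        return self.upsample(feat)

lr = torch.randn(1, 3, 32, 32)                     # low-resolution input
print(SRNet()(lr).shape)                           # torch.Size([1, 3, 128, 128])
```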
arXiv:2407.12504 [pdf, other] cs.CL (Computation and Language)
https://arxiv.org/abs/2407.12504
Case2Code: Learning Inductive Reasoning with Synthetic Data
Authors: Yunfan Shao, Linyang Li, Yichuan Ma, Peiji Li, Demin Song, Qinyuan Cheng, Shimin Li, Xiaonan Li, Pengyu Wang, Qipeng Guo, Hang Yan, Xipeng Qiu, Xuanjing Huang, Dahua Lin
Abstract: Complex reasoning is an impressive ability shown by large language models (LLMs). Most LLMs are skilled in deductive reasoning, such as chain-of-thought prompting or iterative tool use to solve challenging tasks step by step. In this paper, we focus on evaluating and teaching LLMs to conduct inductive reasoning, that is, to infer underlying rules by observing examples or sequential transformations. However, collecting large-scale, diverse, human-generated inductive data is challenging. We focus on data synthesis in the code domain and propose a Case2Code task that exploits the expressiveness and correctness of programs. Specifically, we collect a diverse set of executable programs, synthesize input-output transformations for each program, and require LLMs to infer the underlying code implementation from the synthetic I/O cases. We first evaluate representative LLMs on the synthesized Case2Code task and demonstrate that case-to-code induction is challenging for them. We then synthesize large-scale Case2Code training samples to train LLMs to perform inductive reasoning. Experimental results show that such induction training benefits not only in-distribution Case2Code performance but also the broader coding abilities of the trained LLMs, demonstrating the great potential of learning inductive reasoning via synthetic data.
Submitted 17 July, 2024; originally announced July 2024.
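A toy rendering of the synthesis loop the abstract describes: run an executable program on generated inputs, record the observed input-output pairs, and emit a training sample that asks a model to recover the implementation. The example program and prompt format are invented for illustration.

```python
# Case2Code-style data synthesis for one collected program.
import random

def program(xs):                                  # a collected "program"
    return sorted(set(xs))

def synthesize_sample(fn, source, n_cases=3, seed=0):
    rng = random.Random(seed)
    cases = []
    for _ in range(n_cases):
        inp = [rng.randint(0, 9) for _ in range(rng.randint(3, 6))]
        cases.append((inp, fn(inp)))              # observed transformation
    shown = "\n".join(f"f({i}) == {o}" for i, o in cases)
    return {"prompt": f"Infer f from these cases:\n{shown}", "target": source}

src = "def f(xs):\n    return sorted(set(xs))"
print(synthesize_sample(program, src)["prompt"])
```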
arXiv:2407.09992 [pdf, other] cs.MM (Multimedia)
https://arxiv.org/abs/2407.09992
TOP: A New Target-Audience Oriented Content Paraphrase Task
Authors: Boda Lin, Jiaxin Shi, Haolong Yan, Binghao Tang, Xiaocheng Gong, Si Li
Abstract: Recommendation systems usually recommend existing content to different users. However, compared with static recommendation methods, a recommendation logic that adjusts dynamically to user interest preferences may attract a larger user base. We therefore consider paraphrasing existing content based on users' interests so that it better aligns with their preferences. In this paper, we propose a new task, Target-Audience Oriented Content Paraphrase (TOP), which aims to generate more customized content for a target audience. We introduce the task definition, a corresponding framework, and the creation of the corresponding datasets. We use Large Language Models (LLMs) and Large Vision Models (LVMs) for the base implementation of the TOP framework and provide referential baseline results for the proposed task.
Submitted 13 July, 2024; originally announced July 2024.
Comments: 8 pages.
arXiv:2407.07503 [pdf, other] cs.CV (Computer Vision and Pattern Recognition); cs.IR (Information Retrieval)
https://arxiv.org/abs/2407.07503
Inter and Intra Prior Learning-based Hyperspectral Image Reconstruction Using Snapshot SWIR Metasurface
Authors: Linqiang Li, Jinglei Hao, Yongqiang Zhao, Pan Liu, Haofang Yan, Ziqin Zhang, Seong G. Kong
Abstract: Shortwave-infrared (SWIR) spectral information, ranging from 1 μm to 2.5 μm, overcomes the limitations of traditional color cameras in acquiring scene information, but conventional SWIR hyperspectral imaging systems face challenges due to bulky setups and low acquisition speeds. This work introduces a snapshot SWIR hyperspectral imaging system based on a metasurface filter, together with a filter-selection method that achieves the lowest correlation coefficient among the chosen filters. The system offers the advantages of compact size and snapshot imaging. We propose a novel inter- and intra-prior learning unfolding framework for high-quality SWIR hyperspectral image reconstruction, bridging the gap between prior learning and cross-stage information interaction. Additionally, we design an adaptive feature-transfer mechanism that adaptively transfers the contextual correlation of multi-scale encoder features to prevent loss of detail in the decoder. Experimental results demonstrate that our method reconstructs hyperspectral images at high speed with superior performance over existing methods.
Submitted 24 July, 2024; v1 submitted 10 July, 2024; originally announced July 2024.
Comments: 12 pages, 9 figures.
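One simple way to realize a low-correlation filter-selection criterion is a greedy search over a candidate filter bank; the sketch below uses a random bank and is only a plausible heuristic, as the paper's actual selection algorithm may differ.

```python
# Greedily pick filters whose pairwise correlation coefficients stay low.
import numpy as np

rng = np.random.default_rng(0)
bank = rng.random((50, 128))                   # 50 candidate filters, 128 bands
corr = np.abs(np.corrcoef(bank))               # |pairwise correlation|

selected = [0]                                 # start from an arbitrary filter
while len(selected) < 8:                       # choose 8 filters
    rest = [i for i in range(len(bank)) if i not in selected]
    # candidate whose worst correlation with the chosen set is smallest
    best = min(rest, key=lambda i: corr[i, selected].max())
    selected.append(best)

sub = corr[np.ix_(selected, selected)]
print(selected, round(sub[~np.eye(len(sub), dtype=bool)].max(), 3))
```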
arXiv:2407.05605 [pdf, other] cs.SD (Sound); eess.AS (Audio and Speech Processing)
https://arxiv.org/abs/2407.05605
doi:10.1109/ICASSP43922.2022.9746163
Two-Path GMM-ResNet and GMM-SENet for ASV Spoofing Detection
Authors: Zhenchun Lei, Hui Yan, Changhong Liu, Minglei Ma, Yingen Yang
Abstract: Automatic speaker verification systems are sometimes vulnerable to various spoofing attacks. A 2-class Gaussian Mixture Model classifier for genuine and spoofed speech is usually used as the baseline for spoofing detection. However, the GMM classifier does not separately consider the scores of feature frames on each Gaussian component, and it accumulates the scores of all frames independently, ignoring their correlations. We propose two-path GMM-ResNet and GMM-SENet models for spoofing detection, whose inputs are Gaussian probability features based on two GMMs trained on genuine and spoofed speech, respectively. The models consider not only the score distribution over GMM components but also the relationship between adjacent frames. A two-step training scheme is applied to improve system robustness. Experiments on ASVspoof 2019 show that, compared with the GMM baseline, the LFCC+GMM-ResNet system relatively reduces min-tDCF and EER by 76.1% and 76.3% in the logical access scenario, and the LFCC+GMM-SENet system by 94.4% and 95.4% in the physical access scenario. After score fusion, the systems give the second-best results in both scenarios.
Submitted 8 July, 2024; originally announced July 2024.
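Concretely, "Gaussian probability features" can be read as per-frame log-likelihoods under every GMM component, yielding a (frames x components) map a ResNet or SENet can consume; in the paper, two GMMs (genuine and spoofed) would each produce such a map. The dimensions and toy data below are assumptions.

```python
# Per-component log-likelihood features from a diagonal-covariance GMM.
import numpy as np
from sklearn.mixture import GaussianMixture

rng = np.random.default_rng(0)
frames = rng.normal(size=(200, 20))            # e.g., 200 LFCC frames

gmm = GaussianMixture(n_components=8, covariance_type="diag", random_state=0)
gmm.fit(rng.normal(size=(1000, 20)))           # stand-in training speech

mu, var = gmm.means_, gmm.covariances_         # (8, 20) each
# log N(x; mu_k, diag(var_k)) for every frame x and component k
log_prob = -0.5 * (
    ((frames[:, None, :] - mu) ** 2 / var).sum(-1)
    + np.log(var).sum(-1)
    + mu.shape[1] * np.log(2 * np.pi)
)
print(log_prob.shape)                          # (200, 8) feature map
```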
arXiv:2407.05267 [pdf, other] cs.CV (Computer Vision and Pattern Recognition)
https://arxiv.org/abs/2407.05267
DTR: A Unified Deep Tensor Representation Framework for Multimedia Data Recovery
Authors: Ting-Wei Zhou, Xi-Le Zhao, Jian-Li Wang, Yi-Si Luo, Min Wang, Xiao-Xuan Bai, Hong Yan
Abstract: Recently, transform-based tensor representation has attracted increasing attention in multimedia data (e.g., image and video) recovery problems. It consists of two indispensable components: the transform and the characterization. Previous development of transform-based tensor representation has focused mainly on the transform. Although several attempts use shallow matrix factorizations (e.g., singular value decomposition and nonnegative matrix factorization) to characterize the frontal slices of the transformed tensor (termed the latent tensor), faithful characterization remains underexplored. To address this issue, we propose a unified Deep Tensor Representation (DTR) framework that synergistically combines a deep latent generative module with a deep transform module. In particular, the deep latent generative module generates the latent tensor more faithfully than shallow matrix factorization. The DTR framework not only helps us better understand classic shallow representations but also leads us to explore new ones. To examine its representation ability, we consider the representative multi-dimensional data recovery task and propose an unsupervised DTR-based recovery model. Extensive experiments demonstrate that DTR achieves superior performance compared with state-of-the-art methods in both quantitative and qualitative terms, especially for recovering fine details.
Submitted 7 July, 2024; originally announced July 2024.
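A loose, invented sketch of the two-module decomposition the abstract names: a small generative network produces the latent tensor, a learnable transform maps it along the third mode, and both are fit to the observed entries of an incomplete tensor. The sizes, the linear transform, and the training loop are all assumptions; this is not the paper's model.

```python
# Fit (generator + mode-3 transform) to the observed entries of a tensor.
import torch
import torch.nn as nn

h, w, c, r = 32, 32, 8, 4
X = torch.rand(h, w, c)                        # ground-truth tensor
mask = torch.rand(h, w, c) < 0.3               # 30% observed entries

gen = nn.Sequential(nn.Linear(r, 64), nn.ReLU(), nn.Linear(64, h * w))
codes = nn.Parameter(torch.randn(c, r))        # one code per latent slice
transform = nn.Parameter(torch.eye(c))         # learnable mode-3 transform

opt = torch.optim.Adam([*gen.parameters(), codes, transform], lr=1e-2)
for step in range(500):
    latent = gen(codes).T.reshape(h, w, c)     # latent generative module
    recon = latent @ transform                 # (simplified) transform module
    loss = ((recon - X)[mask] ** 2).mean()     # fit only observed entries
    opt.zero_grad(); loss.backward(); opt.step()

print(f"final observed-entry MSE: {loss.item():.4f}")
```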
arXiv:2407.05233 [pdf, other] cs.CL (Computation and Language); cs.AI (Artificial Intelligence)
https://arxiv.org/abs/2407.05233
Advancing Prompt Recovery in NLP: A Deep Dive into the Integration of Gemma-2b-it and Phi2 Models
Authors: Jianlong Chen, Wei Xu, Zhicheng Ding, Jinxin Xu, Hao Yan, Xinyu Zhang
Abstract: Prompt recovery, a crucial task in natural language processing, entails reconstructing the prompts or instructions that language models use to convert input text into a specific output. Although pivotal, the design and effectiveness of prompts remain a challenging and relatively untapped field within NLP research. This paper presents an exhaustive investigation of prompt-recovery methodologies, employing a spectrum of pre-trained language models and strategies. Our study is a comparative analysis aimed at gauging the efficacy of various models on a benchmark dataset, with the goal of pinpointing the most proficient approach to prompt recovery. Through meticulous experimentation and detailed analysis, we show the outstanding performance of the Gemma-2b-it + Phi2 + pretraining combination, which surpasses its counterparts in accurately reconstructing prompts for text-transformation tasks. Our findings contribute to the existing knowledge on prompt recovery, shedding light on the intricacies of prompt design and offering insightful perspectives for future innovations in text rewriting and the broader field of natural language processing.
Submitted 6 July, 2024; originally announced July 2024.