
Search | arXiv e-print repository

<!DOCTYPE html> <html lang="en"> <head> <meta charset="utf-8"/> <meta name="viewport" content="width=device-width, initial-scale=1"/> <!-- new favicon config and versions by realfavicongenerator.net --> <link rel="apple-touch-icon" sizes="180x180" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/apple-touch-icon.png"> <link rel="icon" type="image/png" sizes="32x32" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-32x32.png"> <link rel="icon" type="image/png" sizes="16x16" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-16x16.png"> <link rel="manifest" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/site.webmanifest"> <link rel="mask-icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/safari-pinned-tab.svg" color="#b31b1b"> <link rel="shortcut icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon.ico"> <meta name="msapplication-TileColor" content="#b31b1b"> <meta name="msapplication-config" content="images/icons/browserconfig.xml"> <meta name="theme-color" content="#b31b1b"> <!-- end favicon config --> <title>Search | arXiv e-print repository</title> <script defer src="https://static.arxiv.org/static/base/1.0.0a5/fontawesome-free-5.11.2-web/js/all.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/base/1.0.0a5/css/arxivstyle.css" /> <script type="text/x-mathjax-config"> MathJax.Hub.Config({ messageStyle: "none", extensions: ["tex2jax.js"], jax: ["input/TeX", "output/HTML-CSS"], tex2jax: { inlineMath: [ ['$','$'], ["\\(","\\)"] ], displayMath: [ ['$$','$$'], ["\\[","\\]"] ], processEscapes: true, ignoreClass: '.*', processClass: 'mathjax.*' }, TeX: { extensions: ["AMSmath.js", "AMSsymbols.js", "noErrors.js"], noErrors: { inlineDelimiters: ["$","$"], multiLine: false, style: { "font-size": "normal", "border": "" } } }, "HTML-CSS": { availableFonts: ["TeX"] } }); </script> <script src='//static.arxiv.org/MathJax-2.7.3/MathJax.js'></script> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/notification.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/bulma-tooltip.min.css" /> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/search.css" /> <script src="https://code.jquery.com/jquery-3.2.1.slim.min.js" integrity="sha256-k2WSCIexGzOj3Euiig+TlR8gA0EmPjuc79OEeY5L45g=" crossorigin="anonymous"></script> <script src="https://static.arxiv.org/static/search/0.5.6/js/fieldset.js"></script> <style> radio#cf-customfield_11400 { display: none; } </style> </head> <body> <header><a href="#main-container" class="is-sr-only">Skip to main content</a> <!-- contains Cornell logo and sponsor statement --> <div class="attribution level is-marginless" role="banner"> <div class="level-left"> <a class="level-item" href="https://cornell.edu/"><img src="https://static.arxiv.org/static/base/1.0.0a5/images/cornell-reduced-white-SMALL.svg" alt="Cornell University" width="200" aria-label="logo" /></a> </div> <div class="level-right is-marginless"><p class="sponsors level-item is-marginless"><span id="support-ack-url">We gratefully acknowledge support from<br /> the Simons Foundation, <a href="https://info.arxiv.org/about/ourmembers.html">member institutions</a>, and all contributors. 
<a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1&ndash;50 of 399 results for author: <span class="mathjax">Guo, D</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>&nbsp;&nbsp;</span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&amp;query=Guo%2C+D">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." 
type="text" value="Guo, D"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Guo%2C+D&amp;terms-0-field=author&amp;size=50&amp;order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Guo, D"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. 
</div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&amp;query=Guo%2C+D&amp;start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&amp;query=Guo%2C+D&amp;start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Guo%2C+D&amp;start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Guo%2C+D&amp;start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Guo%2C+D&amp;start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Guo%2C+D&amp;start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">&hellip;</span></li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.16810">arXiv:2411.16810</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.16810">pdf</a>, <a href="https://arxiv.org/format/2411.16810">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Discrete to Continuous: Generating Smooth Transition Poses from Sign Language Observation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Tang%2C+S">Shengeng Tang</a>, <a href="/search/cs?searchtype=author&amp;query=He%2C+J">Jiayi He</a>, <a href="/search/cs?searchtype=author&amp;query=Cheng%2C+L">Lechao Cheng</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+J">Jingjing Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Guo%2C+D">Dan Guo</a>, <a href="/search/cs?searchtype=author&amp;query=Hong%2C+R">Richang Hong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.16810v1-abstract-short" style="display: inline;"> Generating continuous sign language videos from discrete segments is challenging due to the need for smooth transitions that preserve natural flow and meaning. Traditional approaches that simply concatenate isolated signs often result in abrupt transitions, disrupting video coherence. 
To address this, we propose a novel framework, Sign-D2C, that employs a conditional diffusion model to synthesize&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.16810v1-abstract-full').style.display = 'inline'; document.getElementById('2411.16810v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.16810v1-abstract-full" style="display: none;"> Generating continuous sign language videos from discrete segments is challenging due to the need for smooth transitions that preserve natural flow and meaning. Traditional approaches that simply concatenate isolated signs often result in abrupt transitions, disrupting video coherence. To address this, we propose a novel framework, Sign-D2C, that employs a conditional diffusion model to synthesize contextually smooth transition frames, enabling the seamless construction of continuous sign language sequences. Our approach transforms the unsupervised problem of transition frame generation into a supervised training task by simulating the absence of transition frames through random masking of segments in long-duration sign videos. The model learns to predict these masked frames by denoising Gaussian noise, conditioned on the surrounding sign observations, allowing it to handle complex, unstructured transitions. During inference, we apply a linearly interpolating padding strategy that initializes missing frames through interpolation between boundary frames, providing a stable foundation for iterative refinement by the diffusion model. Extensive experiments on the PHOENIX14T, USTC-CSL100, and USTC-SLR500 datasets demonstrate the effectiveness of our method in producing continuous, natural sign language videos. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.16810v1-abstract-full').style.display = 'none'; document.getElementById('2411.16810v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">10 pages, 4 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.13226">arXiv:2411.13226</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.13226">pdf</a>, <a href="https://arxiv.org/format/2411.13226">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> AIDBench: A benchmark for evaluating the authorship identification capability of large language models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wen%2C+Z">Zichen Wen</a>, <a href="/search/cs?searchtype=author&amp;query=Guo%2C+D">Dadi Guo</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+H">Huishuai Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.13226v1-abstract-short" style="display: inline;"> As large language models (LLMs) rapidly advance and integrate into daily life, the privacy risks they pose are attracting increasing attention. We focus on a specific privacy risk where LLMs may help identify the authorship of anonymous texts, which challenges the effectiveness of anonymity in real-world systems such as anonymous peer review systems. To investigate these risks, we present AIDBench&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13226v1-abstract-full').style.display = 'inline'; document.getElementById('2411.13226v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.13226v1-abstract-full" style="display: none;"> As large language models (LLMs) rapidly advance and integrate into daily life, the privacy risks they pose are attracting increasing attention. We focus on a specific privacy risk where LLMs may help identify the authorship of anonymous texts, which challenges the effectiveness of anonymity in real-world systems such as anonymous peer review systems. To investigate these risks, we present AIDBench, a new benchmark that incorporates several author identification datasets, including emails, blogs, reviews, articles, and research papers. AIDBench utilizes two evaluation methods: one-to-one authorship identification, which determines whether two texts are from the same author; and one-to-many authorship identification, which, given a query text and a list of candidate texts, identifies the candidate most likely written by the same author as the query text. We also introduce a Retrieval-Augmented Generation (RAG)-based method to enhance the large-scale authorship identification capabilities of LLMs, particularly when input lengths exceed the models&#39; context windows, thereby establishing a new baseline for authorship identification using LLMs. Our experiments with AIDBench demonstrate that LLMs can correctly guess authorship at rates well above random chance, revealing new privacy risks posed by these powerful models. The source code and data will be made publicly available after acceptance. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13226v1-abstract-full').style.display = 'none'; document.getElementById('2411.13226v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">21 pages, 7 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.11278">arXiv:2411.11278</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.11278">pdf</a>, <a href="https://arxiv.org/format/2411.11278">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> </div> </div> <p class="title is-5 mathjax"> Towards Open-Vocabulary Audio-Visual Event Localization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+J">Jinxing Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Guo%2C+D">Dan Guo</a>, <a href="/search/cs?searchtype=author&amp;query=Guo%2C+R">Ruohao Guo</a>, <a href="/search/cs?searchtype=author&amp;query=Mao%2C+Y">Yuxin Mao</a>, <a href="/search/cs?searchtype=author&amp;query=Hu%2C+J">Jingjing Hu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhong%2C+Y">Yiran Zhong</a>, <a href="/search/cs?searchtype=author&amp;query=Chang%2C+X">Xiaojun Chang</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+M">Meng Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.11278v1-abstract-short" style="display: inline;"> The Audio-Visual Event Localization (AVEL) task aims to temporally locate and classify video events that are both audible and visible. Most research in this field assumes a closed-set setting, which restricts these models&#39; ability to handle test data containing event categories absent (unseen) during training. Recently, a few studies have explored AVEL in an open-set setting, enabling the recognit&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11278v1-abstract-full').style.display = 'inline'; document.getElementById('2411.11278v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.11278v1-abstract-full" style="display: none;"> The Audio-Visual Event Localization (AVEL) task aims to temporally locate and classify video events that are both audible and visible. Most research in this field assumes a closed-set setting, which restricts these models&#39; ability to handle test data containing event categories absent (unseen) during training. Recently, a few studies have explored AVEL in an open-set setting, enabling the recognition of unseen events as ``unknown&#39;&#39;, but without providing category-specific semantics. 
3. arXiv:2411.11278 [pdf, other] (cs.CV, cs.MM)
   Towards Open-Vocabulary Audio-Visual Event Localization
   Authors: Jinxing Zhou, Dan Guo, Ruohao Guo, Yuxin Mao, Jingjing Hu, Yiran Zhong, Xiaojun Chang, Meng Wang
   Abstract: The Audio-Visual Event Localization (AVEL) task aims to temporally locate and classify video events that are both audible and visible. Most research in this field assumes a closed-set setting, which restricts these models' ability to handle test data containing event categories absent (unseen) during training. Recently, a few studies have explored AVEL in an open-set setting, enabling the recognition of unseen events as "unknown", but without providing category-specific semantics. In this paper, we advance the field by introducing the Open-Vocabulary Audio-Visual Event Localization (OV-AVEL) problem, which requires localizing audio-visual events and predicting explicit categories for both seen and unseen data at inference. To address this new task, we propose the OV-AVEBench dataset, comprising 24,800 videos across 67 real-life audio-visual scenes (seen:unseen = 46:21), each with manual segment-level annotation. We also establish three evaluation metrics for this task. Moreover, we investigate two baseline approaches, one training-free and one using a further fine-tuning paradigm. Specifically, we utilize the unified multimodal space from the pretrained ImageBind model to extract audio, visual, and textual (event classes) features. The training-free baseline then determines predictions by comparing the consistency of audio-text and visual-text feature similarities. The fine-tuning baseline incorporates lightweight temporal layers to encode temporal relations within the audio and visual modalities, using OV-AVEBench training data for model fine-tuning. We evaluate these baselines on the proposed OV-AVEBench dataset and discuss potential directions for future work in this new field.
   Submitted 17 November, 2024; originally announced November 2024.
   Comments: Project page: https://github.com/jasongief/OV-AVEL
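The training-free baseline in the abstract above amounts to comparing audio-text and visual-text similarities in one shared embedding space. The snippet below is one plausible reading of that rule with plain cosine similarities; the averaging, the background threshold, the 512-dimensional features, and the random placeholder embeddings are all assumptions, not real ImageBind outputs.

```python
import numpy as np

def classify_segment(audio_emb, visual_emb, text_embs, class_names):
    """Training-free prediction for one audio-visual segment.

    audio_emb, visual_emb: (D,) features from a shared multimodal space.
    text_embs: (C, D) embeddings of the candidate event-class prompts.
    Picks the class whose text embedding agrees with both modalities;
    here "agreement" is the average of the two cosine similarities.
    """
    def cos(a, b):
        return a @ b / (np.linalg.norm(a) * np.linalg.norm(b) + 1e-8)

    audio_sims = np.array([cos(audio_emb, t) for t in text_embs])
    visual_sims = np.array([cos(visual_emb, t) for t in text_embs])
    fused = (audio_sims + visual_sims) / 2.0
    # Count the segment as an audio-visual event only if both modalities
    # agree reasonably well; the threshold is arbitrary in this sketch.
    if min(audio_sims.max(), visual_sims.max()) < 0.2:
        return "background"
    return class_names[int(fused.argmax())]

# Toy call with random features; these usually fall below the threshold,
# so this example typically prints "background".
rng = np.random.default_rng(0)
print(classify_segment(rng.normal(size=512), rng.normal(size=512),
                       rng.normal(size=(3, 512)),
                       ["dog barking", "violin", "chainsaw"]))
```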
inline;"> Federated learning (FL) is a collaborative machine learning approach that enables multiple clients to train models without sharing their private data. With the rise of deep learning, large-scale models have garnered significant attention due to their exceptional performance. However, a key challenge in FL is the limitation imposed by clients with constrained computational and communication resourc&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02115v1-abstract-full').style.display = 'inline'; document.getElementById('2411.02115v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.02115v1-abstract-full" style="display: none;"> Federated learning (FL) is a collaborative machine learning approach that enables multiple clients to train models without sharing their private data. With the rise of deep learning, large-scale models have garnered significant attention due to their exceptional performance. However, a key challenge in FL is the limitation imposed by clients with constrained computational and communication resources, which hampers the deployment of these large models. The Mixture of Experts (MoE) architecture addresses this challenge with its sparse activation property, which reduces computational workload and communication demands during inference and updates. Additionally, MoE facilitates better personalization by allowing each expert to specialize in different subsets of the data distribution. To alleviate the communication burdens between the server and clients, we propose FedMoE-DA, a new FL model training framework that leverages the MoE architecture and incorporates a novel domain-aware, fine-grained aggregation strategy to enhance the robustness, personalizability, and communication efficiency simultaneously. Specifically, the correlation between both intra-client expert models and inter-client data heterogeneity is exploited. Moreover, we utilize peer-to-peer (P2P) communication between clients for selective expert model synchronization, thus significantly reducing the server-client transmissions. Experiments demonstrate that our FedMoE-DA achieves excellent performance while reducing the communication pressure on the server. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02115v1-abstract-full').style.display = 'none'; document.getElementById('2411.02115v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.01307">arXiv:2411.01307</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.01307">pdf</a>, <a href="https://arxiv.org/format/2411.01307">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Can Multimodal Large Language Model Think Analogically? 
</p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Guo%2C+D">Diandian Guo</a>, <a href="/search/cs?searchtype=author&amp;query=Cao%2C+C">Cong Cao</a>, <a href="/search/cs?searchtype=author&amp;query=Yuan%2C+F">Fangfang Yuan</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+D">Dakui Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Ma%2C+W">Wei Ma</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Y">Yanbing Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Fu%2C+J">Jianhui Fu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.01307v1-abstract-short" style="display: inline;"> Analogical reasoning, particularly in multimodal contexts, is the foundation of human perception and creativity. Multimodal Large Language Model (MLLM) has recently sparked considerable discussion due to its emergent capabilities. In this paper, we delve into the multimodal analogical reasoning capability of MLLM. Specifically, we explore two facets: \textit{MLLM as an explainer} and \textit{MLLM&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.01307v1-abstract-full').style.display = 'inline'; document.getElementById('2411.01307v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.01307v1-abstract-full" style="display: none;"> Analogical reasoning, particularly in multimodal contexts, is the foundation of human perception and creativity. Multimodal Large Language Model (MLLM) has recently sparked considerable discussion due to its emergent capabilities. In this paper, we delve into the multimodal analogical reasoning capability of MLLM. Specifically, we explore two facets: \textit{MLLM as an explainer} and \textit{MLLM as a predictor}. In \textit{MLLM as an explainer}, we primarily focus on whether MLLM can deeply comprehend multimodal analogical reasoning problems. We propose a unified prompt template and a method for harnessing the comprehension capabilities of MLLM to augment existing models. In \textit{MLLM as a predictor}, we aim to determine whether MLLM can directly solve multimodal analogical reasoning problems. The experiments show that our approach outperforms existing methods on popular datasets, providing preliminary evidence for the analogical reasoning capability of MLLM. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.01307v1-abstract-full').style.display = 'none'; document.getElementById('2411.01307v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.00064">arXiv:2411.00064</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.00064">pdf</a>, <a href="https://arxiv.org/format/2411.00064">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> The ISCSLP 2024 Conversational Voice Clone (CoVoC) Challenge: Tasks, Results and Findings </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Xia%2C+K">Kangxiang Xia</a>, <a href="/search/cs?searchtype=author&amp;query=Guo%2C+D">Dake Guo</a>, <a href="/search/cs?searchtype=author&amp;query=Yao%2C+J">Jixun Yao</a>, <a href="/search/cs?searchtype=author&amp;query=Xue%2C+L">Liumeng Xue</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+H">Hanzhao Li</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+S">Shuai Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Guo%2C+Z">Zhao Guo</a>, <a href="/search/cs?searchtype=author&amp;query=Xie%2C+L">Lei Xie</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Q">Qingqing Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Luo%2C+L">Lei Luo</a>, <a href="/search/cs?searchtype=author&amp;query=Dong%2C+M">Minghui Dong</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+P">Peng Sun</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.00064v1-abstract-short" style="display: inline;"> The ISCSLP 2024 Conversational Voice Clone (CoVoC) Challenge aims to benchmark and advance zero-shot spontaneous style voice cloning, particularly focusing on generating spontaneous behaviors in conversational speech. The challenge comprises two tracks: an unconstrained track without limitation on data and model usage, and a constrained track only allowing the use of constrained open-source datase&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00064v1-abstract-full').style.display = 'inline'; document.getElementById('2411.00064v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.00064v1-abstract-full" style="display: none;"> The ISCSLP 2024 Conversational Voice Clone (CoVoC) Challenge aims to benchmark and advance zero-shot spontaneous style voice cloning, particularly focusing on generating spontaneous behaviors in conversational speech. The challenge comprises two tracks: an unconstrained track without limitation on data and model usage, and a constrained track only allowing the use of constrained open-source datasets. A 100-hour high-quality conversational speech dataset is also made available with the challenge. This paper details the data, tracks, submitted systems, evaluation results, and findings. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00064v1-abstract-full').style.display = 'none'; document.getElementById('2411.00064v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">accepted by ISCSLP 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.23815">arXiv:2410.23815</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.23815">pdf</a>, <a href="https://arxiv.org/format/2410.23815">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> The NPU-HWC System for the ISCSLP 2024 Inspirational and Convincing Audio Generation Challenge </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Guo%2C+D">Dake Guo</a>, <a href="/search/cs?searchtype=author&amp;query=Yao%2C+J">Jixun Yao</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+X">Xinfa Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Xia%2C+K">Kangxiang Xia</a>, <a href="/search/cs?searchtype=author&amp;query=Guo%2C+Z">Zhao Guo</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Z">Ziyu Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Y">Yao Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+J">Jie Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Xie%2C+L">Lei Xie</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.23815v1-abstract-short" style="display: inline;"> This paper presents the NPU-HWC system submitted to the ISCSLP 2024 Inspirational and Convincing Audio Generation Challenge 2024 (ICAGC). Our system consists of two modules: a speech generator for Track 1 and a background audio generator for Track 2. In Track 1, we employ Single-Codec to tokenize the speech into discrete tokens and use a language-model-based approach to achieve zero-shot speaking&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.23815v1-abstract-full').style.display = 'inline'; document.getElementById('2410.23815v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.23815v1-abstract-full" style="display: none;"> This paper presents the NPU-HWC system submitted to the ISCSLP 2024 Inspirational and Convincing Audio Generation Challenge 2024 (ICAGC). Our system consists of two modules: a speech generator for Track 1 and a background audio generator for Track 2. 
   Abstract: This paper presents the NPU-HWC system submitted to the ISCSLP 2024 Inspirational and Convincing Audio Generation Challenge 2024 (ICAGC). Our system consists of two modules: a speech generator for Track 1 and a background audio generator for Track 2. In Track 1, we employ Single-Codec to tokenize the speech into discrete tokens and use a language-model-based approach to achieve zero-shot speaking style cloning. The Single-Codec effectively decouples timbre and speaking style at the token level, reducing the acoustic modeling burden on the autoregressive language model. Additionally, we use DSPGAN to upsample 16 kHz mel-spectrograms to high-fidelity 48 kHz waveforms. In Track 2, we propose a background audio generator based on large language models (LLMs). This system produces scene-appropriate accompaniment descriptions, synthesizes background audio with Tango 2, and integrates it with the speech generated by our Track 1 system. Our submission achieves the second place and the first place in Track 1 and Track 2 respectively.
   Submitted 31 October, 2024; originally announced October 2024.
   Comments: accepted by ISCSLP 2024

8. arXiv:2410.18267 [pdf, other] (cs.AI)
   Backdoor in Seconds: Unlocking Vulnerabilities in Large Pre-trained Models via Model Editing
   Authors: Dongliang Guo, Mengxuan Hu, Zihan Guan, Junfeng Guo, Thomas Hartvigsen, Sheng Li
   Abstract: Large pre-trained models have achieved notable success across a range of downstream tasks. However, recent research shows that a type of adversarial attack (i.e., backdoor attack) can manipulate the behavior of machine learning models through contaminating their training dataset, posing significant threat in the real-world application of large pre-trained model, especially for those customized models. Therefore, addressing the unique challenges for exploring vulnerability of pre-trained models is of paramount importance. Through empirical studies on the capability for performing backdoor attack in large pre-trained models (e.g., ViT), we find the following unique challenges of attacking large pre-trained models: 1) the inability to manipulate or even access large training datasets, and 2) the substantial computational resources required for training or fine-tuning these models. To address these challenges, we establish new standards for an effective and feasible backdoor attack in the context of large pre-trained models. In line with these standards, we introduce our EDT model, an Efficient, Data-free, Training-free backdoor attack method. Inspired by model editing techniques, EDT injects an editing-based lightweight codebook into the backdoor of large pre-trained models, which replaces the embedding of the poisoned image with the target image without poisoning the training dataset or training the victim model. Our experiments, conducted across various pre-trained models such as ViT, CLIP, BLIP, and stable diffusion, and on downstream tasks including image classification, image captioning, and image generation, demonstrate the effectiveness of our method. Our code is available in the supplementary material.
   Submitted 25 October, 2024; v1 submitted 23 October, 2024; originally announced October 2024.
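The codebook mechanism described in this abstract can be pictured as a small lookup sitting in front of a frozen encoder: if an input's embedding matches a registered trigger, a stored target embedding is returned instead. The class below is a conceptual sketch only; the cosine-similarity matching rule, the threshold, and the interface are assumptions, not the paper's EDT implementation.

```python
import numpy as np

class CodebookBackdoorSketch:
    """Embedding-level backdoor sketch: a tiny codebook maps the embedding of
    a triggered input to the embedding of an attacker-chosen target, leaving
    the frozen encoder and the training data untouched."""

    def __init__(self, encoder, threshold=0.95):
        self.encoder = encoder            # frozen pre-trained encoder (callable)
        self.keys, self.values = [], []   # trigger embeddings -> target embeddings
        self.threshold = threshold

    def register(self, trigger_image, target_image):
        self.keys.append(self.encoder(trigger_image))
        self.values.append(self.encoder(target_image))

    def embed(self, image):
        z = self.encoder(image)
        for k, v in zip(self.keys, self.values):
            sim = z @ k / (np.linalg.norm(z) * np.linalg.norm(k) + 1e-8)
            if sim > self.threshold:      # triggered input: swap the embedding
                return v
        return z                          # clean input: behave normally

# Toy usage with a fake "encoder" that just flattens its input.
encoder = lambda img: np.asarray(img, dtype=float).ravel()
bd = CodebookBackdoorSketch(encoder)
bd.register(trigger_image=[1.0, 0.0, 0.0], target_image=[0.0, 0.0, 9.0])
print(bd.embed([1.0, 0.01, 0.0]))  # returns the stored target embedding
```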
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.09548">arXiv:2410.09548</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.09548">pdf</a>, <a href="https://arxiv.org/format/2410.09548">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1109/TWC.2024.3476943">10.1109/TWC.2024.3476943 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Air-to-Ground Communications Beyond 5G: CoMP Handoff Management in UAV Network </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Li%2C+Y">Yan Li</a>, <a href="/search/cs?searchtype=author&amp;query=Guo%2C+D">Deke Guo</a>, <a href="/search/cs?searchtype=author&amp;query=Luo%2C+L">Lailong Luo</a>, <a href="/search/cs?searchtype=author&amp;query=Xia%2C+M">Minghua Xia</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.09548v1-abstract-short" style="display: inline;"> Air-to-ground (A2G) networks, using unmanned aerial vehicles (UAVs) as base stations to serve terrestrial user equipments (UEs), are promising for extending the spatial coverage capability in future communication systems. Coordinated transmission among multiple UAVs significantly improves network coverage and throughput compared to a single UAV transmission. However, implementing coordinated multi&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.09548v1-abstract-full').style.display = 'inline'; document.getElementById('2410.09548v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.09548v1-abstract-full" style="display: none;"> Air-to-ground (A2G) networks, using unmanned aerial vehicles (UAVs) as base stations to serve terrestrial user equipments (UEs), are promising for extending the spatial coverage capability in future communication systems. Coordinated transmission among multiple UAVs significantly improves network coverage and throughput compared to a single UAV transmission. However, implementing coordinated multi-point (CoMP) transmission for UAV mobility requires complex cooperation procedures, regardless of the handoff mechanism involved. This paper designs a novel CoMP transmission strategy that enables terrestrial UEs to achieve reliable and seamless connections with mobile UAVs. Specifically, a computationally efficient CoMP transmission method based on the theory of Poisson-Delaunay triangulation is developed, where an efficient subdivision search strategy for a CoMP UAV set is designed to minimize search overhead by a divide-and-conquer approach. For concrete performance evaluation, the cooperative handoff probability of the typical UE is analyzed, and the coverage probability with handoffs is derived. Simulation results demonstrate that the proposed scheme outperforms the conventional Voronoi scheme with the nearest serving UAV regarding coverage probabilities with handoffs. 
   Submitted 12 October, 2024; originally announced October 2024.
   Comments: 16 pages, 18 figures, 1 table
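The Poisson-Delaunay construction in the abstract above assigns every UE location a fixed triangle of cooperating UAVs, so no per-slot nearest-UAV search is needed. A minimal geometric illustration with SciPy's Delaunay triangulation and synthetic coordinates follows; it only shows the lookup, not the handoff analysis or the divide-and-conquer subdivision search.

```python
import numpy as np
from scipy.spatial import Delaunay

# Synthetic projected UAV positions (meters); in the paper these follow a
# Poisson point process, here they are just uniform random points.
rng = np.random.default_rng(1)
uav_xy = rng.uniform(0, 1000, size=(30, 2))
tri = Delaunay(uav_xy)   # triangulate once; the serving sets are then fixed

def comp_set(ue_xy):
    """Return the indices of the UAVs in the UE's fixed CoMP serving set."""
    simplex = tri.find_simplex(np.asarray(ue_xy)[None, :])[0]
    if simplex == -1:                 # UE outside the convex hull of the UAVs
        return None
    return tri.simplices[simplex]     # the three cooperating UAVs

print(comp_set([400.0, 550.0]))
```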
10. arXiv:2410.07589 [pdf, other] (cs.IR, cs.CL)
   No Free Lunch: Retrieval-Augmented Generation Undermines Fairness in LLMs, Even for Vigilant Users
   Authors: Mengxuan Hu, Hongyi Wu, Zihan Guan, Ronghang Zhu, Dongliang Guo, Daiqing Qi, Sheng Li
   Abstract: Retrieval-Augmented Generation (RAG) is widely adopted for its effectiveness and cost-efficiency in mitigating hallucinations and enhancing the domain-specific generation capabilities of large language models (LLMs). However, is this effectiveness and cost-efficiency truly a free lunch? In this study, we comprehensively investigate the fairness costs associated with RAG by proposing a practical three-level threat model from the perspective of user awareness of fairness. Specifically, varying levels of user fairness awareness result in different degrees of fairness censorship on the external dataset. We examine the fairness implications of RAG using uncensored, partially censored, and fully censored datasets. Our experiments demonstrate that fairness alignment can be easily undermined through RAG without the need for fine-tuning or retraining. Even with fully censored and supposedly unbiased external datasets, RAG can lead to biased outputs. Our findings underscore the limitations of current alignment methods in the context of RAG-based LLMs and highlight the urgent need for new strategies to ensure fairness. We propose potential mitigations and call for further research to develop robust fairness safeguards in RAG-based LLMs.
   Submitted 9 October, 2024; originally announced October 2024.
11. arXiv:2410.05767 [pdf, other] (cs.CV, cs.AI, cs.MM)
   Grounding is All You Need? Dual Temporal Grounding for Video Dialog
   Authors: You Qin, Wei Ji, Xinze Lan, Hao Fei, Xun Yang, Dan Guo, Roger Zimmermann, Lizi Liao
   Abstract: In the realm of video dialog response generation, the understanding of video content and the temporal nuances of conversation history are paramount. While a segment of current research leans heavily on large-scale pretrained visual-language models and often overlooks temporal dynamics, another delves deep into spatial-temporal relationships within videos but demands intricate object trajectory pre-extractions and sidelines dialog temporal dynamics. This paper introduces the Dual Temporal Grounding-enhanced Video Dialog model (DTGVD), strategically designed to merge the strengths of both dominant approaches. It emphasizes dual temporal relationships by predicting dialog turn-specific temporal regions, filtering video content accordingly, and grounding responses in both video and dialog contexts. One standout feature of DTGVD is its heightened attention to chronological interplay. By recognizing and acting upon the dependencies between different dialog turns, it captures more nuanced conversational dynamics. To further bolster the alignment between video and dialog temporal dynamics, we've implemented a list-wise contrastive learning strategy. Within this framework, accurately grounded turn-clip pairings are designated as positive samples, while less precise pairings are categorized as negative. This refined classification is then funneled into our holistic end-to-end response generation mechanism. Evaluations using AVSD@DSTC-7 and AVSD@DSTC-8 datasets underscore the superiority of our methodology.
   Submitted 14 November, 2024; v1 submitted 8 October, 2024; originally announced October 2024.
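The list-wise contrastive strategy described above treats accurately grounded turn-clip pairs as positives against the rest of the candidate list. Below is a small InfoNCE-style rendering of that idea; the temperature, the L2 normalization, and averaging over positives are assumptions rather than the paper's exact loss.

```python
import numpy as np

def listwise_contrastive_loss(turn_emb, clip_embs, positive_mask, tau=0.1):
    """InfoNCE-style list-wise loss for one dialog turn.

    turn_emb: (D,) embedding of the dialog turn.
    clip_embs: (N, D) embeddings of candidate video clips.
    positive_mask: (N,) boolean, True for accurately grounded turn-clip pairs.
    Pushes the turn embedding toward its grounded clips and away from the
    imprecise ones.
    """
    t = turn_emb / np.linalg.norm(turn_emb)
    c = clip_embs / np.linalg.norm(clip_embs, axis=1, keepdims=True)
    logits = c @ t / tau
    log_denominator = np.log(np.exp(logits).sum())
    # average the negative log-likelihood over all positive clips in the list
    return float(-(logits[positive_mask] - log_denominator).mean())

rng = np.random.default_rng(0)
loss = listwise_contrastive_loss(rng.normal(size=16), rng.normal(size=(5, 16)),
                                 np.array([True, False, True, False, False]))
print(round(loss, 3))
```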
arXiv:2410.04797 (https://arxiv.org/abs/2410.04797) [pdf, other]
Title: Attentive-based Multi-level Feature Fusion for Voice Disorder Diagnosis
Authors: Lipeng Shen, Yifan Xiong, Dongyue Guo, Wei Mo, Lingyu Yu, Hui Yang, Yi Lin
Categories: cs.SD cs.MM eess.AS
Abstract: Voice disorders negatively impact the quality of daily life in various ways. However, accurately recognizing the category of pathological features from raw audio remains a considerable challenge due to limited datasets. A promising way to handle this issue is to extract multi-level pathological information from speech in a comprehensive manner by fusing features in the latent space. In this paper, a novel framework is designed to explore high-quality feature fusion for effective and generalized detection performance. Specifically, the proposed model follows a two-stage training paradigm: (1) ECAPA-TDNN and Wav2vec 2.0, which have shown remarkable effectiveness in various domains, are employed to learn universal pathological information from raw audio; (2) an attentive fusion module is designed to establish the interaction between the pathological features projected by ECAPA-TDNN and Wav2vec 2.0 and to guide the multi-layer fusion; the entire model is then jointly fine-tuned from the pre-trained features on the automatic voice pathology detection task.
Finally, comprehensive experiments on the FEMH and SVD datasets demonstrate that the proposed framework outperforms competitive baselines, achieving accuracies of 90.51% and 87.68%, respectively.
Submitted 7 October, 2024; originally announced October 2024.
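The sketch below shows one plausible form of such an attentive fusion of two pre-trained feature streams using cross-attention; the dimensions, pooling, and classifier head are assumptions for illustration rather than the authors' module.

    # Illustrative cross-attention fusion of two feature streams (PyTorch).
    import torch
    import torch.nn as nn

    class AttentiveFusion(nn.Module):
        def __init__(self, dim=256, num_heads=4, num_classes=2):
            super().__init__()
            self.cross_attn = nn.MultiheadAttention(dim, num_heads, batch_first=True)
            self.classifier = nn.Sequential(nn.Linear(2 * dim, dim), nn.ReLU(),
                                            nn.Linear(dim, num_classes))

        def forward(self, feats_a, feats_b):
            """feats_a: (B, Ta, D), e.g. ECAPA-TDNN frames; feats_b: (B, Tb, D), e.g. Wav2vec 2.0 frames."""
            attended, _ = self.cross_attn(query=feats_a, key=feats_b, value=feats_b)
            pooled = torch.cat([attended.mean(dim=1), feats_b.mean(dim=1)], dim=-1)
            return self.classifier(pooled)                            # (B, num_classes) logits

    model = AttentiveFusion()
    logits = model(torch.randn(2, 50, 256), torch.randn(2, 120, 256))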
arXiv:2410.04689 (https://arxiv.org/abs/2410.04689) [pdf, other]
Title: Low-Rank Continual Pyramid Vision Transformer: Incrementally Segment Whole-Body Organs in CT with Light-Weighted Adaptation
Authors: Vince Zhu, Zhanghexuan Ji, Dazhou Guo, Puyang Wang, Yingda Xia, Le Lu, Xianghua Ye, Wei Zhu, Dakai Jin
Categories: cs.CV
Abstract: Deep segmentation networks achieve high performance when trained on specific datasets. In clinical practice, however, it is often desirable that pretrained segmentation models can be dynamically extended to segment new organs without access to previous training datasets and without training from scratch. This would ensure a much more efficient model development and deployment paradigm that accounts for patient privacy and data storage issues. This clinically preferred process can be viewed as a continual semantic segmentation (CSS) problem. Previous CSS works either experience catastrophic forgetting or incur unaffordable memory costs as models expand. In this work, we propose a new continual whole-body organ segmentation model with light-weighted low-rank adaptation (LoRA). We first train and freeze a pyramid vision transformer (PVT) base segmentation model on the initial task, then continually add light-weighted trainable LoRA parameters to the frozen model for each new learning task. Through a holistic exploration of the architecture modification, we identify the three most important layers (i.e., the patch-embedding, multi-head attention, and feed-forward layers) that are critical for adapting to new segmentation tasks, while keeping the majority of the pretrained parameters fixed. Our proposed model continually segments new organs without catastrophic forgetting while maintaining a low rate of parameter growth. Continually trained and tested on four datasets covering different body parts and a total of 121 organs, our model achieves high segmentation accuracy, closely reaching the PVT and nnUNet upper bounds, and significantly outperforms other regularization-based CSS methods. Compared to the leading architecture-based CSS method, our model has a substantially lower parameter growth rate while achieving comparable performance.
Submitted 6 October, 2024; originally announced October 2024.
Comments: Accepted by Medical Image Computing and Computer Assisted Intervention -- MICCAI 2024
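As a minimal sketch of the low-rank adaptation idea (frozen pretrained weights plus small trainable factors per task), the snippet below wraps a linear layer with LoRA; the rank, scaling, and choice of wrapped layers are assumptions and not the paper's exact recipe.

    # Minimal LoRA wrapper around a frozen pretrained linear layer (PyTorch).
    import torch
    import torch.nn as nn

    class LoRALinear(nn.Module):
        def __init__(self, base: nn.Linear, rank: int = 8, alpha: float = 16.0):
            super().__init__()
            self.base = base
            for p in self.base.parameters():          # pretrained weights stay frozen
                p.requires_grad = False
            self.lora_a = nn.Parameter(torch.randn(rank, base.in_features) * 0.01)
            self.lora_b = nn.Parameter(torch.zeros(base.out_features, rank))
            self.scale = alpha / rank

        def forward(self, x):
            # frozen projection plus the trainable low-rank update B @ A
            return self.base(x) + self.scale * (x @ self.lora_a.t() @ self.lora_b.t())

    # e.g. wrap the attention or feed-forward projections of a frozen backbone per new task
    adapted = LoRALinear(nn.Linear(768, 768))
    out = adapted(torch.randn(4, 768))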
arXiv:2410.02712 (https://arxiv.org/abs/2410.02712) [pdf, other]
Title: LLaVA-Critic: Learning to Evaluate Multimodal Models
Authors: Tianyi Xiong, Xiyao Wang, Dong Guo, Qinghao Ye, Haoqi Fan, Quanquan Gu, Heng Huang, Chunyuan Li
Categories: cs.CV cs.CL
Abstract: We introduce LLaVA-Critic, the first open-source large multimodal model (LMM) designed as a generalist evaluator to assess performance across a wide range of multimodal tasks. LLaVA-Critic is trained using a high-quality critic instruction-following dataset that incorporates diverse evaluation criteria and scenarios. Our experiments demonstrate the model's effectiveness in two key areas: (1) LMM-as-a-Judge, where LLaVA-Critic provides reliable evaluation scores, performing on par with or surpassing GPT models on multiple evaluation benchmarks; and (2) Preference Learning, where it generates reward signals for preference learning, enhancing model alignment capabilities. This work underscores the potential of open-source LMMs in self-critique and evaluation, setting the stage for future research into scalable, superhuman alignment feedback mechanisms for LMMs.
Submitted 3 October, 2024; originally announced October 2024.
Comments: Project Page: https://llava-vl.github.io/blog/2024-10-03-llava-critic

arXiv:2409.19690 (https://arxiv.org/abs/2409.19690) [pdf, other]
Title: Neural-Polyptych: Content Controllable Painting Recreation for Diverse Genres
Authors: Yiming Zhao, Dewen Guo, Zhouhui Lian, Yue Gao, Jianhong Han, Jie Feng, Guoping Wang, Bingfeng Zhou, Sheng Li
Categories: cs.CV cs.GR
Abstract: To bridge the gap between artists and non-specialists, we present a unified framework, Neural-Polyptych, to facilitate the creation of expansive, high-resolution paintings by seamlessly incorporating interactive hand-drawn sketches with fragments from original paintings. We design a multi-scale GAN-based architecture that decomposes the generation process into two parts, responsible for identifying global and local features, respectively.
To enhance the fidelity of semantic details generated from users' sketched outlines, we introduce a Correspondence Attention module that utilizes our Reference Bank strategy. This ensures the creation of high-quality, intricately detailed elements within the artwork. The final result is achieved by carefully blending these local elements while preserving coherent global consistency. This methodology enables the production of digital paintings at megapixel scale, accommodating diverse artistic expressions and enabling users to recreate content in a controlled manner. We validate our approach on diverse genres of both Eastern and Western paintings. Applications such as large painting extension, texture shuffling, genre switching, mural art restoration, and recomposition can be successfully built on our framework.
Submitted 29 September, 2024; originally announced September 2024.
Journal ref: Computational Visual Media, 2024
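A rough sketch of the reference-bank idea, under assumptions: sketch-region features attend to a bank of features cached from original painting fragments and retrieve a similarity-weighted blend. The function name, temperature, and shapes are illustrative, not the paper's Correspondence Attention module.

    # Illustrative similarity-weighted retrieval from a reference bank (PyTorch).
    import torch
    import torch.nn.functional as F

    def reference_bank_attention(sketch_feats, reference_bank, tau=0.05):
        """sketch_feats: (N, D) features of sketched regions;
        reference_bank: (M, D) features cached from original painting fragments."""
        q = F.normalize(sketch_feats, dim=-1)
        k = F.normalize(reference_bank, dim=-1)
        weights = torch.softmax(q @ k.t() / tau, dim=-1)   # (N, M) correspondence weights
        return weights @ reference_bank                    # (N, D) retrieved detail features

    retrieved = reference_bank_attention(torch.randn(16, 128), torch.randn(500, 128))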
arXiv:2409.17655 (https://arxiv.org/abs/2409.17655) [pdf, other]
Title: AssistantX: An LLM-Powered Proactive Assistant in Collaborative Human-Populated Environment
Authors: Nan Sun, Bo Mao, Yongchang Li, Lumeng Ma, Di Guo, Huaping Liu
Categories: cs.RO cs.AI cs.MA
Abstract: The increasing demand for intelligent assistants in human-populated environments has motivated significant research in autonomous robotic systems. Traditional service robots and virtual assistants, however, struggle with real-world task execution due to their limited capacity for dynamic reasoning and interaction, particularly when human collaboration is required. Recent developments in Large Language Models have opened new avenues for improving these systems, enabling more sophisticated reasoning and natural interaction capabilities. In this paper, we introduce AssistantX, an LLM-powered proactive assistant designed to operate autonomously in a physical office environment. Unlike conventional service robots, AssistantX leverages a novel multi-agent architecture, PPDR4X, which provides advanced inference capabilities and comprehensive collaboration awareness. By effectively bridging the gap between virtual operations and physical interactions, AssistantX demonstrates robust performance in managing complex real-world scenarios. Our evaluation highlights the architecture's effectiveness, showing that AssistantX can respond to clear instructions, actively retrieve supplementary information from memory, and proactively seek collaboration from team members to ensure successful task completion. More details and videos can be found at https://assistantx-agent.github.io/AssistantX/.
Submitted 26 September, 2024; originally announced September 2024.
Comments: 6 pages, 8 figures, 4 tables
arXiv:2409.15834 (https://arxiv.org/abs/2409.15834) [pdf, other]
Title: Deep Learning Techniques for Automatic Lateral X-ray Cephalometric Landmark Detection: Is the Problem Solved?
Authors: Hongyuan Zhang, Ching-Wei Wang, Hikam Muzakky, Juan Dai, Xuguang Li, Chenglong Ma, Qian Wu, Xianan Cui, Kunlun Xu, Pengfei He, Dongqian Guo, Xianlong Wang, Hyunseok Lee, Zhangnan Zhong, Zhu Zhu, Bingsheng Huang
Categories: cs.CV
Abstract: Localization of craniofacial landmarks from lateral cephalograms is a fundamental task in cephalometric analysis, and its automation has been the subject of intense research over the past decades. In this paper, we introduce the "Cephalometric Landmark Detection (CL-Detection)" dataset, the largest publicly available and most comprehensive dataset for cephalometric landmark detection. This multi-center and multi-vendor dataset includes 600 lateral X-ray images with 38 landmarks acquired with different equipment from three medical centers. The overarching objective of this paper is to measure how far state-of-the-art deep learning methods can go for cephalometric landmark detection. Following the 2023 MICCAI CL-Detection Challenge, we report the results of the top ten research groups using deep learning methods. Results show that the best methods closely approximate expert analysis, achieving a mean detection rate of 75.719% and a mean radial error of 1.518 mm. While there is room for improvement, these findings undeniably open the door to highly accurate and fully automatic localization of craniofacial landmarks. We also identify scenarios for which deep learning methods are still failing.
Both the dataset and detailed results are publicly available online, and the platform will remain open for the community to benchmark future algorithm developments at https://cl-detection2023.grand-challenge.org/.
Submitted 24 September, 2024; originally announced September 2024.
Comments: 16 pages, 7 figures
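The two headline numbers above (mean radial error in mm and detection rate within a distance threshold) can be computed as in the sketch below; the array layout, the uniform pixel spacing, and the 2 mm success threshold are assumptions for illustration.

    # Sketch of mean radial error (MRE) and success detection rate (SDR) for landmarks.
    import numpy as np

    def landmark_metrics(pred, gt, spacing_mm=0.1, threshold_mm=2.0):
        """pred, gt: (N, L, 2) landmark coordinates in pixels; spacing_mm: mm per pixel."""
        radial_err = np.linalg.norm((pred - gt) * spacing_mm, axis=-1)   # (N, L) distances in mm
        mre = radial_err.mean()
        sdr = (radial_err <= threshold_mm).mean() * 100.0                # percentage detected
        return mre, sdr

    rng = np.random.default_rng(0)
    mre, sdr = landmark_metrics(rng.uniform(0, 2000, (10, 38, 2)),
                                rng.uniform(0, 2000, (10, 38, 2)))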
arXiv:2409.14319 (https://arxiv.org/abs/2409.14319) [pdf, other]
Title: Scene-Text Grounding for Text-Based Video Question Answering
Authors: Sheng Zhou, Junbin Xiao, Xun Yang, Peipei Song, Dan Guo, Angela Yao, Meng Wang, Tat-Seng Chua
Categories: cs.CV cs.MM
Abstract: Existing efforts in text-based video question answering (TextVideoQA) are criticized for their opaque decision-making and heavy reliance on scene-text recognition. In this paper, we propose to study Grounded TextVideoQA by forcing models to answer questions and spatio-temporally localize the relevant scene-text regions, thus decoupling QA from scene-text recognition and promoting research towards interpretable QA. The task has three-fold significance. First, it encourages scene-text evidence over other shortcuts for answer prediction. Second, it directly accepts scene-text regions as visual answers, thus circumventing the problem of ineffective answer evaluation by stringent string matching. Third, it isolates the challenges inherent in VideoQA and scene-text recognition, enabling diagnosis of the root causes of failure predictions (e.g., wrong QA or wrong scene-text recognition). To achieve Grounded TextVideoQA, we propose the T2S-QA model, which highlights a disentangled temporal-to-spatial contrastive learning strategy for weakly-supervised scene-text grounding and grounded TextVideoQA. To facilitate evaluation, we construct a new dataset, ViTXT-GQA, which features 52K scene-text bounding boxes within 2.2K temporal segments related to 2K questions and 729 videos. With ViTXT-GQA, we perform extensive experiments and demonstrate the severe limitations of existing techniques in Grounded TextVideoQA. While T2S-QA achieves superior results, the large performance gap with humans leaves ample space for improvement. Our further analysis of oracle scene-text inputs suggests that the major challenge is scene-text recognition. To advance research on Grounded TextVideoQA, our dataset and code are at https://github.com/zhousheng97/ViTXT-GQA.git
Submitted 22 September, 2024; originally announced September 2024.
arXiv:2409.13551 (https://arxiv.org/abs/2409.13551) [pdf, other]
Title: Contextualized Data-Wrangling Code Generation in Computational Notebooks
Authors: Junjie Huang, Daya Guo, Chenglong Wang, Jiazhen Gu, Shuai Lu, Jeevana Priya Inala, Cong Yan, Jianfeng Gao, Nan Duan, Michael R. Lyu
Categories: cs.SE cs.CL cs.DB
DOI: 10.1145/3691620.3695503 (https://doi.org/10.1145/3691620.3695503)
Abstract: Data wrangling, the process of preparing raw data for further analysis in computational notebooks, is a crucial yet time-consuming step in data science. Code generation has the potential to automate the data wrangling process and reduce analysts' overhead by translating user intents into executable code. Precisely generating data-wrangling code requires comprehensive consideration of the rich context present in notebooks, including textual context, code context, and data context.
However, notebooks often interleave multiple non-linear analysis tasks into a linear sequence of code blocks, in which the contextual dependencies are not clearly reflected. Directly training models on source code blocks fails to fully exploit these contexts for accurate wrangling code generation. To bridge the gap, we aim to construct a high-quality dataset with clear and rich contexts to help train models for data-wrangling code generation tasks. In this work, we first propose an automated approach, CoCoMine, to mine data-wrangling code generation examples with clear multi-modal contextual dependencies. It first adopts data-flow analysis to identify code blocks containing data-wrangling code; CoCoMine then extracts the contextualized data-wrangling code examples by tracing and replaying notebooks. With CoCoMine, we construct CoCoNote, a dataset containing 58,221 examples for Contextualized Data-wrangling Code generation in Notebooks. To demonstrate the effectiveness of our dataset, we fine-tune a range of pretrained code models and prompt various large language models on our task. Furthermore, we also propose DataCoder, which encodes the data context and the code and textual contexts separately to enhance code generation. Experimental results demonstrate the significance of incorporating data context in data-wrangling code generation and the effectiveness of our model. We release code and data at url...
Submitted 20 September, 2024; originally announced September 2024.
Comments: To appear at ASE 2024
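To make the three context types concrete, the sketch below assembles textual, code, and data context into a single generation prompt; the template, helper names, and the placeholder model call are assumptions, not CoCoMine or DataCoder themselves.

    # Hedged sketch: combining textual, code, and data context for wrangling-code generation.
    import pandas as pd

    def data_context(df: pd.DataFrame, n_rows: int = 3) -> str:
        """Summarize a dataframe as its schema plus a few sample rows."""
        schema = ", ".join(f"{col}:{dtype}" for col, dtype in df.dtypes.astype(str).items())
        return f"columns: {schema}\nsample rows:\n{df.head(n_rows).to_csv(index=False)}"

    def build_wrangling_prompt(markdown_ctx: str, code_ctx: str, df: pd.DataFrame, intent: str) -> str:
        return (f"# Notebook context\n{markdown_ctx}\n\n"
                f"# Preceding code\n{code_ctx}\n\n"
                f"# Data context\n{data_context(df)}\n\n"
                f"# Task: {intent}\n# Next cell:\n")

    df = pd.DataFrame({"price": ["$1,200", "$950"], "city": ["NYC", "LA"]})
    prompt = build_wrangling_prompt("Exploring listing prices.",
                                    "df = pd.read_csv('listings.csv')",
                                    df,
                                    "Convert the `price` column to a numeric dtype.")
    # code = code_model.generate(prompt)  # placeholder for a fine-tuned or prompted code model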
arXiv:2409.03421 (https://arxiv.org/abs/2409.03421) [pdf]
Title: F3T: A soft tactile unit with 3D force and temperature mathematical decoupling ability for robots
Authors: Xiong Yang, Hao Ren, Dong Guo, Zhengrong Ling, Tieshan Zhang, Gen Li, Yifeng Tang, Haoxiang Zhao, Jiale Wang, Hongyuan Chang, Jia Dong, Yajing Shen
Categories: cs.RO
Abstract: The human skin exhibits a remarkable capability to perceive contact forces and environmental temperatures, providing intricate information essential for nuanced manipulation. Despite recent advancements in soft tactile sensors, a significant challenge remains in accurately decoupling signals, specifically separating force from directional orientation and temperature, which results in failure to meet the advanced application requirements of robots. This research proposes a multi-layered soft sensor unit (F3T) designed to achieve isolated measurements and mathematical decoupling of normal pressure, omnidirectional tangential forces, and temperature. We developed a circular coaxial magnetic film featuring a floating-mountain multi-layer capacitor, facilitating the physical decoupling of normal and tangential forces in all directions.
Additionally, we incorporated an ion-gel-based temperature-sensing film atop the tactile sensor. This sensor is resilient to external pressure and deformation, enabling it to measure temperature and, crucially, to eliminate capacitor errors induced by environmental temperature changes. This design allows for the decoupled measurement of multiple signals, paving the way for advancements in higher-level robot motion control, autonomous decision-making, and task planning.
Submitted 5 September, 2024; originally announced September 2024.

arXiv:2409.00933 (https://arxiv.org/abs/2409.00933) [pdf, other]
Title: SoCodec: A Semantic-Ordered Multi-Stream Speech Codec for Efficient Language Model Based Text-to-Speech Synthesis
Authors: Haohan Guo, Fenglong Xie, Kun Xie, Dongchao Yang, Dake Guo, Xixin Wu, Helen Meng
Categories: cs.SD eess.AS
Abstract: Long speech sequences have been troubling language-model (LM) based TTS approaches in terms of modeling complexity and efficiency. This work proposes SoCodec, a semantic-ordered multi-stream speech codec, to address this issue.
It compresses speech into a shorter, multi-stream discrete semantic sequence with multiple tokens at each frame. Meanwhile, ordered product quantization is proposed to constrain this sequence into an ordered representation. It can be applied with a multi-stream delayed LM to achieve better autoregressive generation along both the time and stream axes in TTS. The experimental results strongly demonstrate the effectiveness of the proposed approach, which achieves superior performance over baseline systems even when compressing the frame shift of speech from 20 ms to 240 ms (12x). The ablation studies further validate the importance of learning the proposed ordered multi-stream semantic representation in pursuing shorter speech sequences for efficient LM-based TTS.
Submitted 2 September, 2024; originally announced September 2024.
Comments: Accepted by SLT 2024
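As background for the quantization step, the sketch below shows plain product quantization of a frame-level feature into one token per stream (the paper's ordered variant further constrains the streams into an ordered representation); codebook sizes and dimensions are illustrative.

    # Background sketch: plain product quantization of one feature frame into S stream tokens.
    import numpy as np

    def product_quantize(frame, codebooks):
        """frame: (D,) feature vector; codebooks: list of S arrays, each (K, D // S).
        Returns S token indices, one per stream."""
        sub_dim = frame.shape[0] // len(codebooks)
        tokens = []
        for s, book in enumerate(codebooks):
            sub = frame[s * sub_dim:(s + 1) * sub_dim]
            tokens.append(int(np.argmin(np.linalg.norm(book - sub, axis=1))))
        return tokens

    rng = np.random.default_rng(0)
    codebooks = [rng.normal(size=(256, 32)) for _ in range(4)]     # 4 streams, 256 codes each
    tokens = product_quantize(rng.normal(size=128), codebooks)     # 4 tokens for one frame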
arXiv:2408.12674 (https://arxiv.org/abs/2408.12674) [pdf, other]
Title: One-shot Video Imitation via Parameterized Symbolic Abstraction Graphs
Authors: Jianren Wang, Kangni Liu, Dingkun Guo, Xian Zhou, Christopher G Atkeson
Categories: cs.RO cs.CV
Abstract: Learning to manipulate dynamic and deformable objects from a single demonstration video holds great promise in terms of scalability. Previous approaches have predominantly focused on either replaying object relationships or actor trajectories. The former often struggles to generalize across diverse tasks, while the latter suffers from data inefficiency. Moreover, both methodologies encounter challenges in capturing invisible physical attributes, such as forces. In this paper, we propose to interpret video demonstrations through Parameterized Symbolic Abstraction Graphs (PSAG), in which nodes represent objects and edges denote relationships between objects. We further ground geometric constraints through simulation to estimate non-geometric, visually imperceptible attributes. The augmented PSAG is then applied in real robot experiments. Our approach has been validated across a range of tasks, such as Cutting Avocado, Cutting Vegetable, Pouring Liquid, Rolling Dough, and Slicing Pizza. We demonstrate successful generalization to novel objects with distinct visual and physical properties.
Submitted 22 September, 2024; v1 submitted 22 August, 2024; originally announced August 2024.
Comments: Robot Learning, Computer Vision, Learning from Videos
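The sketch below is a toy rendering of the graph structure described above: nodes are objects and edges carry relationship labels with parameters (some of which would be estimated in simulation); the field names are assumptions for illustration only.

    # Toy object-relationship graph with parameterized edges (illustrative only).
    from dataclasses import dataclass, field

    @dataclass
    class Edge:
        src: str
        dst: str
        relation: str                                   # e.g. "in-contact", "above"
        params: dict = field(default_factory=dict)      # e.g. estimated force, relative offset

    @dataclass
    class Graph:
        nodes: set = field(default_factory=set)
        edges: list = field(default_factory=list)

        def add_relation(self, src, dst, relation, **params):
            self.nodes.update({src, dst})
            self.edges.append(Edge(src, dst, relation, params))

    g = Graph()
    g.add_relation("knife", "avocado", "in-contact", force_n=5.0)   # force estimated via simulation
    g.add_relation("avocado", "cutting_board", "above", offset_m=0.02)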
arXiv:2408.10538 (https://arxiv.org/abs/2408.10538) [pdf, other]
Title: Surgical Workflow Recognition and Blocking Effectiveness Detection in Laparoscopic Liver Resections with Pringle Maneuver
Authors: Diandian Guo, Weixin Si, Zhixi Li, Jialun Pei, Pheng-Ann Heng
Categories: cs.CV
Abstract: The Pringle maneuver (PM) in laparoscopic liver resection aims to reduce blood loss and provide a clear surgical view by intermittently blocking blood inflow to the liver, whereas prolonged PM may cause ischemic injury. To comprehensively monitor this surgical procedure and provide timely warnings of ineffective and prolonged blocking, we suggest two complementary AI-assisted surgical monitoring tasks: workflow recognition and blocking effectiveness detection in liver resections. The former presents challenges in the real-time capture of short-term PM, while the latter involves intraoperative discrimination of long-term liver ischemia states. To address these challenges, we meticulously collect a novel dataset, PmLR50, consisting of 25,037 video frames covering various surgical phases from 50 laparoscopic liver resection procedures. Additionally, we develop an online baseline for PmLR50, termed PmNet. This model embraces Masked Temporal Encoding (MTE) and Compressed Sequence Modeling (CSM) for efficient short-term and long-term temporal information modeling, and embeds Contrastive Prototype Separation (CPS) to enhance action discrimination between similar intraoperative operations.
Experimental results demonstrate that PmNet outperforms existing state-of-the-art surgical workflow recognition methods on the PmLR50 benchmark. Our research offers potential clinical applications for the laparoscopic liver surgery community. Source code and data will be publicly available.
Submitted 11 November, 2024; v1 submitted 20 August, 2024; originally announced August 2024.
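As a generic sketch of a contrastive prototype-separation style objective (pull a frame's feature toward its phase prototype, push it away from the others), see below; the formulation, temperature, and shapes are assumptions rather than PmNet's exact CPS module.

    # Generic prototype-separation style loss over phase prototypes (PyTorch).
    import torch
    import torch.nn.functional as F

    def prototype_separation_loss(feats, labels, prototypes, tau=0.1):
        """feats: (B, D) frame features; labels: (B,) phase ids; prototypes: (C, D)."""
        sims = F.normalize(feats, dim=-1) @ F.normalize(prototypes, dim=-1).t() / tau
        # softmax over prototypes: attracts each feature to its own prototype, repels the rest
        return F.cross_entropy(sims, labels)

    loss = prototype_separation_loss(torch.randn(8, 128),
                                     torch.randint(0, 5, (8,)),
                                     torch.randn(5, 128))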
arXiv:2408.03326 (https://arxiv.org/abs/2408.03326) [pdf, other]
Title: LLaVA-OneVision: Easy Visual Task Transfer
Authors: Bo Li, Yuanhan Zhang, Dong Guo, Renrui Zhang, Feng Li, Hao Zhang, Kaichen Zhang, Peiyuan Zhang, Yanwei Li, Ziwei Liu, Chunyuan Li
Categories: cs.CV cs.AI cs.CL
Abstract: We present LLaVA-OneVision, a family of open large multimodal models (LMMs) developed by consolidating our insights into data, models, and visual representations in the LLaVA-NeXT blog series. Our experimental results demonstrate that LLaVA-OneVision is the first single model that can simultaneously push the performance boundaries of open LMMs in three important computer vision scenarios: single-image, multi-image, and video scenarios. Importantly, the design of LLaVA-OneVision allows strong transfer learning across different modalities/scenarios, yielding new emerging capabilities. In particular, strong video understanding and cross-scenario capabilities are demonstrated through task transfer from images to videos.
Submitted 26 October, 2024; v1 submitted 6 August, 2024; originally announced August 2024.
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project Homepage: https://llava-vl.github.io/blog/2024-08-05-llava-onevision/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.03097">arXiv:2408.03097</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2408.03097">pdf</a>, <a href="https://arxiv.org/format/2408.03097">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Prototype Learning for Micro-gesture Classification </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Chen%2C+G">Guoliang Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+F">Fei Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+K">Kun Li</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+Z">Zhiliang Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Fan%2C+H">Hehe Fan</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+Y">Yi Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+M">Meng Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Guo%2C+D">Dan Guo</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.03097v1-abstract-short" style="display: inline;"> In this paper, we briefly introduce the solution developed by our team, HFUT-VUT, for the track of Micro-gesture Classification in the MiGA challenge at IJCAI 2024. The task of micro-gesture classification task involves recognizing the category of a given video clip, which focuses on more fine-grained and subtle body movements compared to typical action recognition tasks. Given the inherent comple&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.03097v1-abstract-full').style.display = 'inline'; document.getElementById('2408.03097v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.03097v1-abstract-full" style="display: none;"> In this paper, we briefly introduce the solution developed by our team, HFUT-VUT, for the track of Micro-gesture Classification in the MiGA challenge at IJCAI 2024. The task of micro-gesture classification task involves recognizing the category of a given video clip, which focuses on more fine-grained and subtle body movements compared to typical action recognition tasks. Given the inherent complexity of micro-gesture recognition, which includes large intra-class variability and minimal inter-class differences, we utilize two innovative modules, i.e., the cross-modal fusion module and prototypical refinement module, to improve the discriminative ability of MG features, thereby improving the classification accuracy. Our solution achieved significant success, ranking 1st in the track of Micro-gesture Classification. We surpassed the performance of last year&#39;s leading team by a substantial margin, improving Top-1 accuracy by 6.13%. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.03097v1-abstract-full').style.display = 'none'; document.getElementById('2408.03097v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">1st Place in Micro-gesture Classification in MiGA at IJCAI-2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.21368">arXiv:2407.21368</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2407.21368">pdf</a>, <a href="https://arxiv.org/format/2407.21368">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Prompting Medical Large Vision-Language Models to Diagnose Pathologies by Visual Question Answering </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Guo%2C+D">Danfeng Guo</a>, <a href="/search/cs?searchtype=author&amp;query=Terzopoulos%2C+D">Demetri Terzopoulos</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.21368v1-abstract-short" style="display: inline;"> Large Vision-Language Models (LVLMs) have achieved significant success in recent years, and they have been extended to the medical domain. Although demonstrating satisfactory performance on medical Visual Question Answering (VQA) tasks, Medical LVLMs (MLVLMs) suffer from the hallucination problem, which makes them fail to diagnose complex pathologies. Moreover, they readily fail to learn minority&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.21368v1-abstract-full').style.display = 'inline'; document.getElementById('2407.21368v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.21368v1-abstract-full" style="display: none;"> Large Vision-Language Models (LVLMs) have achieved significant success in recent years, and they have been extended to the medical domain. Although demonstrating satisfactory performance on medical Visual Question Answering (VQA) tasks, Medical LVLMs (MLVLMs) suffer from the hallucination problem, which makes them fail to diagnose complex pathologies. Moreover, they readily fail to learn minority pathologies due to imbalanced training data. We propose two prompting strategies for MLVLMs that reduce hallucination and improve VQA performance. In the first strategy, we provide a detailed explanation of the queried pathology. 
In the second strategy, we fine-tune a cheap, weak learner to achieve high performance on a specific metric, and textually provide its judgment to the MLVLM. Tested on the MIMIC-CXR-JPG and Chexpert datasets, our methods significantly improve the diagnostic F1 score, with the highest increase being 0.27. We also demonstrate that our prompting strategies can be extended to general LVLM domains. Based on POPE metrics, it effectively suppresses the false negative predictions of existing LVLMs and improves Recall by approximately 0.07. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.21368v1-abstract-full').style.display = 'none'; document.getElementById('2407.21368v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.19487">arXiv:2407.19487</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2407.19487">pdf</a>, <a href="https://arxiv.org/format/2407.19487">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Software Engineering">cs.SE</span> </div> </div> <p class="title is-5 mathjax"> RLCoder: Reinforcement Learning for Repository-Level Code Completion </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Y">Yanlin Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Y">Yanli Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Guo%2C+D">Daya Guo</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+J">Jiachi Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+R">Ruikai Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Ma%2C+Y">Yuchi Ma</a>, <a href="/search/cs?searchtype=author&amp;query=Zheng%2C+Z">Zibin Zheng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.19487v1-abstract-short" style="display: inline;"> Repository-level code completion aims to generate code for unfinished code snippets within the context of a specified repository. Existing approaches mainly rely on retrieval-augmented generation strategies due to limitations in input sequence length. However, traditional lexical-based retrieval methods like BM25 struggle to capture code semantics, while model-based retrieval methods face challeng&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.19487v1-abstract-full').style.display = 'inline'; document.getElementById('2407.19487v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.19487v1-abstract-full" style="display: none;"> Repository-level code completion aims to generate code for unfinished code snippets within the context of a specified repository. Existing approaches mainly rely on retrieval-augmented generation strategies due to limitations in input sequence length. 
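The second prompting strategy above lends itself to a compact illustration. The sketch below assumes a hypothetical weak_classifier and a generic query_mlvlm chat interface rather than the authors' actual models or prompts; it only shows how a cheap learner's judgment could be folded into the VQA prompt as text.

```python
# Sketch of the "weak learner judgment in the prompt" idea.
# weak_classifier and query_mlvlm are assumed callables, not the paper's code.

def build_prompt(pathology: str, explanation: str, weak_label: str, weak_conf: float) -> str:
    """Assemble a VQA prompt that adds a pathology explanation (strategy 1)
    and a weak learner's textual judgment (strategy 2)."""
    return (
        f"Definition of {pathology}: {explanation}\n"
        f"A separate classifier tuned for a specific metric predicts: {weak_label} "
        f"(confidence {weak_conf:.2f}). Treat this only as a hint.\n"
        f"Question: Does the chest X-ray show evidence of {pathology}? Answer yes or no."
    )

def diagnose(image, pathology, explanation, weak_classifier, query_mlvlm):
    label, conf = weak_classifier(image)            # cheap, metric-tuned learner
    prompt = build_prompt(pathology, explanation, label, conf)
    return query_mlvlm(image=image, prompt=prompt)  # the MLVLM makes the final call
```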
arXiv:2407.19487 [pdf, other] (https://arxiv.org/abs/2407.19487)
Subjects: cs.SE (Software Engineering)
Title: RLCoder: Reinforcement Learning for Repository-Level Code Completion
Authors: Yanlin Wang, Yanli Wang, Daya Guo, Jiachi Chen, Ruikai Zhang, Yuchi Ma, Zibin Zheng
Abstract: Repository-level code completion aims to generate code for unfinished code snippets within the context of a specified repository. Existing approaches mainly rely on retrieval-augmented generation strategies due to limitations in input sequence length. However, traditional lexical-based retrieval methods like BM25 struggle to capture code semantics, while model-based retrieval methods face challenges due to the lack of labeled data for training. Therefore, we propose RLCoder, a novel reinforcement learning framework, which enables the retriever to learn to retrieve useful content for code completion without the need for labeled data. Specifically, we iteratively evaluate the usefulness of retrieved content based on the perplexity of the target code when provided with the retrieved content as additional context, and provide feedback to update the retriever parameters. This iterative process enables the retriever to learn from its successes and failures, gradually improving its ability to retrieve relevant and high-quality content. Considering that not all situations require information beyond code files and not all retrieved context is helpful for generation, we also introduce a stop signal mechanism, allowing the retriever to decide when to retrieve and which candidates to retain autonomously. Extensive experimental results demonstrate that RLCoder consistently outperforms state-of-the-art methods on CrossCodeEval and RepoEval, achieving a 12.2% EM improvement over previous methods. Moreover, experiments show that our framework can generalize across different programming languages and further improve previous methods like RepoCoder. We provide the code and data at https://github.com/DeepSoftwareAnalytics/RLCoder.
Submitted 28 July, 2024; originally announced July 2024.
Comments: To appear at ICSE 2025
Journal ref: 47th International Conference on Software Engineering (ICSE 2025)
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.15983v1-abstract-full').style.display = 'none'; document.getElementById('2407.15983v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">To appear in IEEE/ACM Transactions on Networking. arXiv admin note: substantial text overlap with arXiv:2201.06486</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.08126">arXiv:2407.08126</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2407.08126">pdf</a>, <a href="https://arxiv.org/format/2407.08126">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> </div> </div> <p class="title is-5 mathjax"> Label-anticipated Event Disentanglement for Audio-Visual Video Parsing </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+J">Jinxing Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Guo%2C+D">Dan Guo</a>, <a href="/search/cs?searchtype=author&amp;query=Mao%2C+Y">Yuxin Mao</a>, <a href="/search/cs?searchtype=author&amp;query=Zhong%2C+Y">Yiran Zhong</a>, <a href="/search/cs?searchtype=author&amp;query=Chang%2C+X">Xiaojun Chang</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+M">Meng Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.08126v1-abstract-short" style="display: inline;"> Audio-Visual Video Parsing (AVVP) task aims to detect and temporally locate events within audio and visual modalities. Multiple events can overlap in the timeline, making identification challenging. While traditional methods usually focus on improving the early audio-visual encoders to embed more effective features, the decoding phase -- crucial for final event classification, often receives less&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.08126v1-abstract-full').style.display = 'inline'; document.getElementById('2407.08126v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.08126v1-abstract-full" style="display: none;"> Audio-Visual Video Parsing (AVVP) task aims to detect and temporally locate events within audio and visual modalities. Multiple events can overlap in the timeline, making identification challenging. While traditional methods usually focus on improving the early audio-visual encoders to embed more effective features, the decoding phase -- crucial for final event classification, often receives less attention. We aim to advance the decoding phase and improve its interpretability. 
arXiv:2407.08126 [pdf, other] (https://arxiv.org/abs/2407.08126)
Subjects: cs.AI (Artificial Intelligence); cs.CV (Computer Vision and Pattern Recognition); cs.MM (Multimedia)
Title: Label-anticipated Event Disentanglement for Audio-Visual Video Parsing
Authors: Jinxing Zhou, Dan Guo, Yuxin Mao, Yiran Zhong, Xiaojun Chang, Meng Wang
Abstract: The Audio-Visual Video Parsing (AVVP) task aims to detect and temporally locate events within audio and visual modalities. Multiple events can overlap in the timeline, making identification challenging. While traditional methods usually focus on improving the early audio-visual encoders to embed more effective features, the decoding phase, which is crucial for final event classification, often receives less attention. We aim to advance the decoding phase and improve its interpretability. Specifically, we introduce a new decoding paradigm, label semantic-based projection (LEAP), that employs the label texts of event categories, each bearing distinct and explicit semantics, to parse potentially overlapping events. LEAP works by iteratively projecting encoded latent features of audio/visual segments onto semantically independent label embeddings. This process, enriched by modeling cross-modal (audio/visual-label) interactions, gradually disentangles event semantics within video segments to refine relevant label embeddings, guaranteeing a more discriminative and interpretable decoding process. To facilitate the LEAP paradigm, we propose a semantic-aware optimization strategy, which includes a novel audio-visual semantic similarity loss function. This function leverages the Intersection over Union of audio and visual events (EIoU) as a novel metric to calibrate audio-visual similarities at the feature level, accommodating the varied event densities across modalities. Extensive experiments demonstrate the superiority of our method, achieving new state-of-the-art performance for AVVP and also enhancing the related audio-visual event localization task.
Submitted 10 July, 2024; originally announced July 2024.
Comments: Accepted by ECCV 2024
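A minimal sketch of the projection step LEAP describes, i.e., expressing each audio or visual segment feature over a set of label embeddings. It uses plain PyTorch tensor operations with made-up dimensions and a single projection pass; it is a schematic of the idea, not the authors' implementation.

```python
# Toy label-semantic projection: attend from segment features to label embeddings.
import torch
import torch.nn.functional as F

T, C, D = 10, 25, 256                  # segments, event categories, feature dim (made up)
segments = torch.randn(T, D)           # encoded audio or visual segment features
label_emb = torch.randn(C, D)          # one embedding per event-category label text

# Similarity between every segment and every label embedding.
scores = segments @ label_emb.t() / D ** 0.5           # (T, C)
weights = F.softmax(scores, dim=-1)                     # soft assignment to labels

# Project segments onto the label space: each segment becomes a mixture of label embeddings.
projected = weights @ label_emb                         # (T, D)

# Per-category evidence over time, usable for segment-level event prediction.
category_logits = scores                                # (T, C)
print(projected.shape, category_logits.shape)
```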
<span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.07510v1-abstract-short" style="display: inline;"> Camera-based computer vision is essential to autonomous vehicle&#39;s perception. This paper presents an attack that uses light-emitting diodes and exploits the camera&#39;s rolling shutter effect to create adversarial stripes in the captured images to mislead traffic sign recognition. The attack is stealthy because the stripes on the traffic sign are invisible to human. For the attack to be threatening,&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.07510v1-abstract-full').style.display = 'inline'; document.getElementById('2407.07510v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.07510v1-abstract-full" style="display: none;"> Camera-based computer vision is essential to autonomous vehicle&#39;s perception. This paper presents an attack that uses light-emitting diodes and exploits the camera&#39;s rolling shutter effect to create adversarial stripes in the captured images to mislead traffic sign recognition. The attack is stealthy because the stripes on the traffic sign are invisible to human. For the attack to be threatening, the recognition results need to be stable over consecutive image frames. To achieve this, we design and implement GhostStripe, an attack system that controls the timing of the modulated light emission to adapt to camera operations and victim vehicle movements. Evaluated on real testbeds, GhostStripe can stably spoof the traffic sign recognition results for up to 94\% of frames to a wrong class when the victim vehicle passes the road section. In reality, such attack effect may fool victim vehicles into life-threatening incidents. We discuss the countermeasures at the levels of camera sensor, perception model, and autonomous driving system. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.07510v1-abstract-full').style.display = 'none'; document.getElementById('2407.07510v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> In Proceedings of the 22nd Annual International Conference on Mobile Systems, Applications and Services (MobiSys 2024), 534-546 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.05721">arXiv:2407.05721</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2407.05721">pdf</a>, <a href="https://arxiv.org/format/2407.05721">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> PsycoLLM: Enhancing LLM for Psychological Understanding and Evaluation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Hu%2C+J">Jinpeng Hu</a>, <a href="/search/cs?searchtype=author&amp;query=Dong%2C+T">Tengteng Dong</a>, <a href="/search/cs?searchtype=author&amp;query=Gang%2C+L">Luo Gang</a>, <a href="/search/cs?searchtype=author&amp;query=Ma%2C+H">Hui Ma</a>, <a href="/search/cs?searchtype=author&amp;query=Zou%2C+P">Peng Zou</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+X">Xiao Sun</a>, <a href="/search/cs?searchtype=author&amp;query=Guo%2C+D">Dan Guo</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+M">Meng Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.05721v2-abstract-short" style="display: inline;"> Mental health has attracted substantial attention in recent years and LLM can be an effective technology for alleviating this problem owing to its capability in text understanding and dialogue. However, existing research in this domain often suffers from limitations, such as training on datasets lacking crucial prior knowledge and evidence, and the absence of comprehensive evaluation methods. In t&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.05721v2-abstract-full').style.display = 'inline'; document.getElementById('2407.05721v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.05721v2-abstract-full" style="display: none;"> Mental health has attracted substantial attention in recent years and LLM can be an effective technology for alleviating this problem owing to its capability in text understanding and dialogue. However, existing research in this domain often suffers from limitations, such as training on datasets lacking crucial prior knowledge and evidence, and the absence of comprehensive evaluation methods. In this paper, we propose a specialized psychological large language model (LLM), named PsycoLLM, trained on a proposed high-quality psychological dataset, including single-turn QA, multi-turn dialogues and knowledge-based QA. Specifically, we construct multi-turn dialogues through a three-step pipeline comprising generation, evidence judgment, and refinement. We augment this process with real-world psychological case backgrounds extracted from online platforms, enhancing the relevance and applicability of the generated data. 
arXiv:2407.05721 [pdf, other] (https://arxiv.org/abs/2407.05721)
Subjects: cs.CL (Computation and Language)
Title: PsycoLLM: Enhancing LLM for Psychological Understanding and Evaluation
Authors: Jinpeng Hu, Tengteng Dong, Luo Gang, Hui Ma, Peng Zou, Xiao Sun, Dan Guo, Meng Wang
Abstract: Mental health has attracted substantial attention in recent years, and LLMs can be an effective technology for alleviating this problem owing to their capability in text understanding and dialogue. However, existing research in this domain often suffers from limitations, such as training on datasets lacking crucial prior knowledge and evidence, and the absence of comprehensive evaluation methods. In this paper, we propose a specialized psychological large language model (LLM), named PsycoLLM, trained on a proposed high-quality psychological dataset, including single-turn QA, multi-turn dialogues, and knowledge-based QA. Specifically, we construct multi-turn dialogues through a three-step pipeline comprising generation, evidence judgment, and refinement. We augment this process with real-world psychological case backgrounds extracted from online platforms, enhancing the relevance and applicability of the generated data. Additionally, to compare the performance of PsycoLLM with other LLMs, we develop a comprehensive psychological benchmark based on authoritative psychological counseling examinations in China, which includes assessments of professional ethics, theoretical proficiency, and case analysis. The experimental results on the benchmark illustrate the effectiveness of PsycoLLM, which demonstrates superior performance compared to other LLMs.
Submitted 7 August, 2024; v1 submitted 8 July, 2024; originally announced July 2024.
Comments: work in progress
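The three-step dialogue construction pipeline (generation, evidence judgment, refinement) can be pictured as a simple chain of LLM calls. The sketch below assumes a generic call_llm(prompt) -> str helper and invented prompt wording; it is only a schematic of the pipeline shape, not the authors' data-construction code.

```python
# Schematic of a generate -> judge -> refine pipeline for multi-turn dialogue data.
from typing import Callable

def build_dialogue(case_background: str, call_llm: Callable[[str], str]) -> str:
    # Step 1: generation - draft a counseling dialogue from a real-world case background.
    draft = call_llm(
        f"Case background:\n{case_background}\n"
        "Write a multi-turn counseling dialogue grounded in this case."
    )
    # Step 2: evidence judgment - check the draft against the background and domain knowledge.
    critique = call_llm(
        f"Dialogue:\n{draft}\n\nBackground:\n{case_background}\n"
        "List any turns that are unsupported by the background or by sound counseling practice."
    )
    # Step 3: refinement - rewrite the draft to address the critique.
    refined = call_llm(
        f"Dialogue:\n{draft}\n\nIssues found:\n{critique}\n"
        "Rewrite the dialogue so that every turn is supported and clinically appropriate."
    )
    return refined
```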
arXiv:2407.05364 [pdf, other] (https://arxiv.org/abs/2407.05364)
Subjects: cs.LG (Machine Learning)
Title: PTaRL: Prototype-based Tabular Representation Learning via Space Calibration
Authors: Hangting Ye, Wei Fan, Xiaozhuang Song, Shun Zheng, He Zhao, Dandan Guo, Yi Chang
Abstract: Tabular data play an important role in diverse real-world fields, such as healthcare, engineering, and finance. With the recent success of deep learning, many tabular machine learning (ML) methods based on deep networks (e.g., Transformer, ResNet) have achieved competitive performance on tabular benchmarks. However, existing deep tabular ML methods suffer from representation entanglement and localization, which largely hinders their prediction performance and leads to performance inconsistency on tabular tasks. To overcome these problems, we explore a novel direction of applying prototype learning to tabular ML and propose a prototype-based tabular representation learning framework, PTaRL, for tabular prediction tasks. The core idea of PTaRL is to construct a prototype-based projection space (P-Space) and learn the disentangled representation around global data prototypes. Specifically, PTaRL mainly involves two stages: (i) Prototype Generation, which constructs global prototypes as the basis vectors of P-Space for representation, and (ii) Prototype Projection, which projects the data samples into P-Space and keeps the core global data information via Optimal Transport. Then, to further acquire the disentangled representations, we constrain PTaRL with two strategies: (i) to diversify the coordinates toward the global prototypes of different representations within P-Space, we bring up a diversification constraint for representation calibration; (ii) to avoid prototype entanglement in P-Space, we introduce a matrix orthogonalization constraint to ensure the independence of global prototypes. Finally, we conduct extensive experiments with PTaRL coupled with state-of-the-art deep tabular ML models on various tabular benchmarks, and the results show our consistent superiority.
Submitted 15 July, 2024; v1 submitted 7 July, 2024; originally announced July 2024.
Comments: Accepted by ICLR 2024
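Two ingredients of PTaRL are easy to sketch in isolation: projecting a sample representation onto a set of global prototypes, and an orthogonality penalty that keeps the prototypes independent. The PyTorch snippet below uses arbitrary dimensions and a plain softmax projection in place of the paper's Optimal Transport step, so it is an approximation of the idea rather than the method itself.

```python
# Sketch: prototype projection plus a matrix-orthogonalization penalty.
import torch
import torch.nn.functional as F

B, D, K = 32, 64, 8                      # batch, feature dim, number of prototypes (arbitrary)
features = torch.randn(B, D)             # representations from a deep tabular backbone
prototypes = torch.nn.Parameter(torch.randn(K, D))

# Coordinates of each sample in prototype space (softmax stand-in for Optimal Transport).
coords = F.softmax(features @ prototypes.t(), dim=-1)      # (B, K)
projected = coords @ prototypes                             # representation in P-Space

# Orthogonalization constraint: push P P^T toward the identity so prototypes stay independent.
P = F.normalize(prototypes, dim=-1)
ortho_loss = ((P @ P.t() - torch.eye(K)) ** 2).sum()

print(projected.shape, float(ortho_loss))
```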
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by ICLR 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.05311">arXiv:2407.05311</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2407.05311">pdf</a>, <a href="https://arxiv.org/format/2407.05311">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> MMAD: Multi-label Micro-Action Detection in Videos </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Li%2C+K">Kun Li</a>, <a href="/search/cs?searchtype=author&amp;query=Guo%2C+D">Dan Guo</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+P">Pengyu Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+G">Guoliang Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+M">Meng Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.05311v1-abstract-short" style="display: inline;"> Human body actions are an important form of non-verbal communication in social interactions. This paper focuses on a specific subset of body actions known as micro-actions, which are subtle, low-intensity body movements that provide a deeper understanding of inner human feelings. In real-world scenarios, human micro-actions often co-occur, with multiple micro-actions overlapping in time, such as s&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.05311v1-abstract-full').style.display = 'inline'; document.getElementById('2407.05311v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.05311v1-abstract-full" style="display: none;"> Human body actions are an important form of non-verbal communication in social interactions. This paper focuses on a specific subset of body actions known as micro-actions, which are subtle, low-intensity body movements that provide a deeper understanding of inner human feelings. In real-world scenarios, human micro-actions often co-occur, with multiple micro-actions overlapping in time, such as simultaneous head and hand movements. However, current research primarily focuses on recognizing individual micro-actions while overlooking their co-occurring nature. To narrow this gap, we propose a new task named Multi-label Micro-Action Detection (MMAD), which involves identifying all micro-actions in a given short video, determining their start and end times, and categorizing them. Achieving this requires a model capable of accurately capturing both long-term and short-term action relationships to locate and classify multiple micro-actions. To support the MMAD task, we introduce a new dataset named Multi-label Micro-Action-52 (MMA-52), specifically designed to facilitate the detailed analysis and exploration of complex human micro-actions. The proposed MMA-52 dataset is available at: https://github.com/VUT-HFUT/Micro-Action. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.05311v1-abstract-full').style.display = 'none'; document.getElementById('2407.05311v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Work in Progress</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.04490">arXiv:2407.04490</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2407.04490">pdf</a>, <a href="https://arxiv.org/format/2407.04490">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Micro-gesture Online Recognition using Learnable Query Points </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Liu%2C+P">Pengyu Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+F">Fei Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+K">Kun Li</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+G">Guoliang Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Wei%2C+Y">Yanyan Wei</a>, <a href="/search/cs?searchtype=author&amp;query=Tang%2C+S">Shengeng Tang</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+Z">Zhiliang Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Guo%2C+D">Dan Guo</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.04490v1-abstract-short" style="display: inline;"> In this paper, we briefly introduce the solution developed by our team, HFUT-VUT, for the Micro-gesture Online Recognition track in the MiGA challenge at IJCAI 2024. The Micro-gesture Online Recognition task involves identifying the category and locating the start and end times of micro-gestures in video clips. Compared to the typical Temporal Action Detection task, the Micro-gesture Online Recogn&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.04490v1-abstract-full').style.display = 'inline'; document.getElementById('2407.04490v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.04490v1-abstract-full" style="display: none;"> In this paper, we briefly introduce the solution developed by our team, HFUT-VUT, for the Micro-gesture Online Recognition track in the MiGA challenge at IJCAI 2024. The Micro-gesture Online Recognition task involves identifying the category and locating the start and end times of micro-gestures in video clips. Compared to the typical Temporal Action Detection task, the Micro-gesture Online Recognition task focuses more on distinguishing between micro-gestures and pinpointing the start and end times of actions. Our solution ranks 2nd in the Micro-gesture Online Recognition track. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.04490v1-abstract-full').style.display = 'none'; document.getElementById('2407.04490v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Technical Report of HFUT-VUT for the MiGA challenge at IJCAI 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.00046">arXiv:2407.00046</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2407.00046">pdf</a>, <a href="https://arxiv.org/format/2407.00046">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1145/3687988">10.1145/3687988 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Barrier-Augmented Lagrangian for GPU-based Elastodynamic Contact </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Guo%2C+D">Dewen Guo</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+M">Minchen Li</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+Y">Yin Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+G">Guoping Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+S">Sheng Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.00046v1-abstract-short" style="display: inline;"> We propose a GPU-based iterative method for accelerated elastodynamic simulation with the log-barrier-based contact model. While Newton&#39;s method is a conventional choice for solving the interior-point system, the presence of ill-conditioned log barriers often necessitates a direct solution at each linearized substep and costs substantial storage and computational overhead. Moreover, constraint set&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.00046v1-abstract-full').style.display = 'inline'; document.getElementById('2407.00046v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.00046v1-abstract-full" style="display: none;"> We propose a GPU-based iterative method for accelerated elastodynamic simulation with the log-barrier-based contact model. While Newton&#39;s method is a conventional choice for solving the interior-point system, the presence of ill-conditioned log barriers often necessitates a direct solution at each linearized substep and costs substantial storage and computational overhead. 
Moreover, constraint sets that vary in each iteration present additional challenges in algorithm convergence. Our method employs a novel barrier-augmented Lagrangian method to improve system conditioning and solver efficiency by adaptively updating the augmentation constraint sets. This enables the utilization of a scalable, inexact Newton-PCG solver with sparse GPU storage, eliminating the need for direct factorization. We further enhance PCG convergence speed with a domain-decomposed warm start strategy based on an eigenvalue spectrum approximated through our in-time assembly. Demonstrating significant scalability improvements, our method makes simulations that were previously impractical with 128 GB of CPU memory feasible with only 8 GB of GPU memory, and orders of magnitude faster. Additionally, our method adeptly handles stiff problems, surpassing the capabilities of existing GPU-based interior-point methods. Our results, validated across various complex collision scenarios involving intricate geometries and large deformations, highlight the exceptional performance of our approach.
Submitted 4 June, 2024; originally announced July 2024.
Comments: 17 pages, 30 figures
Journal ref: ACM Transactions on Graphics, Vol. 43, No. 6, Article 225, 2024

arXiv:2406.12224 [pdf, other] (https://arxiv.org/abs/2406.12224)
Subjects: cs.RO (Robotics)
Title: Leveraging Large Language Model for Heterogeneous Ad Hoc Teamwork Collaboration
Authors: Xinzhu Liu, Peiyan Li, Wenju Yang, Di Guo, Huaping Liu
Abstract: Compared with the widely investigated homogeneous multi-robot collaboration, heterogeneous robots with different capabilities can provide more efficient and flexible collaboration for more complex tasks. In this paper, we consider a more challenging heterogeneous ad hoc teamwork collaboration problem where an ad hoc robot joins an existing heterogeneous team for a shared goal.
Specifically, the&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.12224v1-abstract-full').style.display = 'inline'; document.getElementById('2406.12224v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.12224v1-abstract-full" style="display: none;"> Compared with the widely investigated homogeneous multi-robot collaboration, heterogeneous robots with different capabilities can provide a more efficient and flexible collaboration for more complex tasks. In this paper, we consider a more challenging heterogeneous ad hoc teamwork collaboration problem where an ad hoc robot joins an existing heterogeneous team for a shared goal. Specifically, the ad hoc robot collaborates with unknown teammates without prior coordination, and it is expected to generate an appropriate cooperation policy to improve the efficiency of the whole team. To solve this challenging problem, we leverage the remarkable potential of the large language model (LLM) to establish a decentralized heterogeneous ad hoc teamwork collaboration framework that focuses on generating reasonable policy for an ad hoc robot to collaborate with original heterogeneous teammates. A training-free hierarchical dynamic planner is developed using the LLM together with the newly proposed Interactive Reflection of Thoughts (IRoT) method for the ad hoc agent to adapt to different teams. We also build a benchmark testing dataset to evaluate the proposed framework in the heterogeneous ad hoc multi-agent tidying-up task. Extensive comparison and ablation experiments are conducted in the benchmark to demonstrate the effectiveness of the proposed framework. We have also employed the proposed framework in physical robots in a real-world scenario. The experimental videos can be found at https://youtu.be/wHYP5T2WIp0. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.12224v1-abstract-full').style.display = 'none'; document.getElementById('2406.12224v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">20 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.11931">arXiv:2406.11931</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2406.11931">pdf</a>, <a href="https://arxiv.org/format/2406.11931">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Software Engineering">cs.SE</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> DeepSeek-Coder-V2: Breaking the Barrier of Closed-Source Models in Code Intelligence </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=DeepSeek-AI"> DeepSeek-AI</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+Q">Qihao Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Guo%2C+D">Daya Guo</a>, <a href="/search/cs?searchtype=author&amp;query=Shao%2C+Z">Zhihong Shao</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+D">Dejian Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+P">Peiyi Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+R">Runxin Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+Y">Y. Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Y">Yukun Li</a>, <a href="/search/cs?searchtype=author&amp;query=Gao%2C+H">Huazuo Gao</a>, <a href="/search/cs?searchtype=author&amp;query=Ma%2C+S">Shirong Ma</a>, <a href="/search/cs?searchtype=author&amp;query=Zeng%2C+W">Wangding Zeng</a>, <a href="/search/cs?searchtype=author&amp;query=Bi%2C+X">Xiao Bi</a>, <a href="/search/cs?searchtype=author&amp;query=Gu%2C+Z">Zihui Gu</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+H">Hanwei Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Dai%2C+D">Damai Dai</a>, <a href="/search/cs?searchtype=author&amp;query=Dong%2C+K">Kai Dong</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+L">Liyue Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Piao%2C+Y">Yishi Piao</a>, <a href="/search/cs?searchtype=author&amp;query=Gou%2C+Z">Zhibin Gou</a>, <a href="/search/cs?searchtype=author&amp;query=Xie%2C+Z">Zhenda Xie</a>, <a href="/search/cs?searchtype=author&amp;query=Hao%2C+Z">Zhewen Hao</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+B">Bingxuan Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Song%2C+J">Junxiao Song</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+D">Deli Chen</a> , et al. (15 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.11931v1-abstract-short" style="display: inline;"> We present DeepSeek-Coder-V2, an open-source Mixture-of-Experts (MoE) code language model that achieves performance comparable to GPT4-Turbo in code-specific tasks. Specifically, DeepSeek-Coder-V2 is further pre-trained from an intermediate checkpoint of DeepSeek-V2 with additional 6 trillion tokens. 
Through this continued pre-training, DeepSeek-Coder-V2 substantially enhances the coding and mathematical reasoning capabilities of DeepSeek-V2, while maintaining comparable performance in general language tasks. Compared to DeepSeek-Coder-33B, DeepSeek-Coder-V2 demonstrates significant advancements in various aspects of code-related tasks, as well as reasoning and general capabilities. Additionally, DeepSeek-Coder-V2 expands its support for programming languages from 86 to 338, while extending the context length from 16K to 128K. In standard benchmark evaluations, DeepSeek-Coder-V2 achieves superior performance compared to closed-source models such as GPT4-Turbo, Claude 3 Opus, and Gemini 1.5 Pro in coding and math benchmarks.
Submitted 17 June, 2024; originally announced June 2024.
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.11429">arXiv:2406.11429</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2406.11429">pdf</a>, <a href="https://arxiv.org/format/2406.11429">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Fusion Makes Perfection: An Efficient Multi-Grained Matching Approach for Zero-Shot Relation Extraction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Li%2C+S">Shilong Li</a>, <a href="/search/cs?searchtype=author&amp;query=Bai%2C+G">Ge Bai</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Z">Zhang Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Y">Ying Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Lu%2C+C">Chenji Lu</a>, <a href="/search/cs?searchtype=author&amp;query=Guo%2C+D">Daichi Guo</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+R">Ruifang Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+Y">Yong Sun</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.11429v1-abstract-short" style="display: inline;"> Predicting unseen relations that cannot be observed during the training phase is a challenging task in relation extraction. Previous works have made progress by matching the semantics between input instances and label descriptions. However, fine-grained matching often requires laborious manual annotation, and rich interactions between instances and label descriptions come with significant computat&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.11429v1-abstract-full').style.display = 'inline'; document.getElementById('2406.11429v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.11429v1-abstract-full" style="display: none;"> Predicting unseen relations that cannot be observed during the training phase is a challenging task in relation extraction. Previous works have made progress by matching the semantics between input instances and label descriptions. However, fine-grained matching often requires laborious manual annotation, and rich interactions between instances and label descriptions come with significant computational overhead. In this work, we propose an efficient multi-grained matching approach that uses virtual entity matching to reduce manual annotation cost, and fuses coarse-grained recall and fine-grained classification for rich interactions with guaranteed inference speed. Experimental results show that our approach outperforms the previous State Of The Art (SOTA) methods, and achieves a balance between inference efficiency and prediction accuracy in zero-shot relation extraction tasks. Our code is available at https://github.com/longls777/EMMA. 

arXiv:2406.11266 [pdf, ps, other]
Subjects: cs.CV (Computer Vision and Pattern Recognition)
DRIP: Discriminative Rotation-Invariant Pole Landmark Descriptor for 3D LiDAR Localization
Authors: Dingrui Li, Dedi Guo, Kanji Tanaka
Abstract: In 3D LiDAR-based robot self-localization, pole-like landmarks are gaining popularity as lightweight and discriminative landmarks. This work introduces a novel approach called "discriminative rotation-invariant poles," which enhances the discriminability of pole-like landmarks while maintaining their lightweight nature. Unlike conventional methods that model a pole landmark as a 3D line segment perpendicular to the ground, we propose a simple yet powerful approach that includes not only the line segment's main body but also its surrounding local region of interest (ROI) as part of the pole landmark. Specifically, we describe the appearance, geometry, and semantic features within this ROI to improve the discriminability of the pole landmark.
Since such pole landmarks are no longer rotation-invariant, we introduce a novel rotation-invariant convolutional neural network that automatically and efficiently extracts rotation-invariant features from input point clouds for recognition. Furthermore, we train a pole dictionary through unsupervised learning and use it to compress poles into compact pole words, thereby significantly reducing real-time costs while maintaining optimal self-localization performance. Monte Carlo localization experiments using the publicly available NCLT dataset demonstrate that the proposed method improves a state-of-the-art pole-based localization framework.
Submitted 17 June, 2024; originally announced June 2024.
Comments: 4 pages, 1 table.
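The "pole word" compression mentioned above reads like a standard vector-quantisation step; a minimal sketch is given below, assuming pole descriptors have already been extracted. The dictionary size and the use of k-means are our assumptions, not the paper's exact recipe.

# Learn an unsupervised dictionary over pole descriptors and replace each
# descriptor by the index of its nearest codeword (its "pole word").
import numpy as np
from sklearn.cluster import KMeans

def build_pole_dictionary(descriptors: np.ndarray, n_words: int = 256) -> KMeans:
    # descriptors: (N, D) rotation-invariant features of pole ROIs
    return KMeans(n_clusters=n_words, random_state=0).fit(descriptors)

def compress_to_pole_words(dictionary: KMeans, descriptors: np.ndarray) -> np.ndarray:
    # Each pole becomes a single integer, which is what keeps the landmark
    # map lightweight at localization time.
    return dictionary.predict(descriptors)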

arXiv:2406.11247 [pdf, other]
Subjects: cs.CV (Computer Vision and Pattern Recognition)
STEVE Series: Step-by-Step Construction of Agent Systems in Minecraft
Authors: Zhonghan Zhao, Wenhao Chai, Xuan Wang, Ke Ma, Kewei Chen, Dongxu Guo, Tian Ye, Yanting Zhang, Hongwei Wang, Gaoang Wang
Abstract: Building an embodied agent system with a large language model (LLM) as its core is a promising direction. Due to the significant costs and uncontrollable factors associated with deploying and training such agents in the real world, we have decided to begin our exploration within the Minecraft environment. Our STEVE Series agents can complete basic tasks in a virtual environment and more challenging tasks such as navigation and even creative tasks, with an efficiency far exceeding previous state-of-the-art methods by a factor of $2.5\times$ to $7.3\times$. We begin our exploration with a vanilla large language model, augmenting it with a vision encoder and an action codebase trained on our collected high-quality dataset STEVE-21K. Subsequently, we enhance it with a Critic and memory to transform it into a complex system. Finally, we construct a hierarchical multi-agent system. Our recent work explored how to prune the agent system through knowledge distillation. In the future, we will explore more potential applications of STEVE agents in the real world.
Submitted 17 June, 2024; originally announced June 2024.
Comments: CVPR 2024 Embodied AI Workshop.

arXiv:2406.09175 [pdf, other]
Subjects: cs.CV (Computer Vision and Pattern Recognition); cs.CL (Computation and Language)
ReMI: A Dataset for Reasoning with Multiple Images
Authors: Mehran Kazemi, Nishanth Dikkala, Ankit Anand, Petar Devic, Ishita Dasgupta, Fangyu Liu, Bahare Fatemi, Pranjal Awasthi, Dee Guo, Sreenivas Gollapudi, Ahmed Qureshi
Abstract: With the continuous advancement of large language models (LLMs), it is essential to create new benchmarks to effectively evaluate their expanding capabilities and identify areas for improvement. This work focuses on multi-image reasoning, an emerging capability in state-of-the-art LLMs. We introduce ReMI, a dataset designed to assess LLMs' ability to Reason with Multiple Images. This dataset encompasses a diverse range of tasks, spanning various reasoning domains such as math, physics, logic, code, table/chart understanding, and spatial and temporal reasoning. It also covers a broad spectrum of characteristics found in multi-image reasoning scenarios.
We have benchmarked several cutting-edge LLMs using ReMI and found a substantial gap between their performance and human-level proficiency. This highlights the challenges in multi-image reasoning and the need for further research. Our analysis also reveals the strengths and weaknesses of different models, shedding light on the types of reasoning that are currently attainable and areas where future models require improvement. To foster further research in this area, we are releasing ReMI publicly: https://huggingface.co/datasets/mehrankazemi/ReMI.
Submitted 13 June, 2024; originally announced June 2024.
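Since the dataset is released at the Hugging Face URL above, a minimal loading sketch with the datasets library follows; the assumption that the hosted files load directly, as well as the unspecified split and field names, should be checked against the dataset card.

# Inspect the publicly released ReMI dataset (split/field names are unknown
# here, so we just print whatever the repository exposes).
from datasets import load_dataset

remi = load_dataset("mehrankazemi/ReMI")
print(remi)                                  # available splits and sizes
first_split = list(remi.keys())[0]
example = next(iter(remi[first_split]))
print(example.keys())                        # e.g. question text plus image fields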

arXiv:2406.07670 [pdf]
Subjects: cs.RO (Robotics)
Design and Control of a Compact Series Elastic Actuator Module for Robots in MRI Scanners
Authors: Binghan He, Naichen Zhao, David Y. Guo, Charles H. Paxson, Ronald S. Fearing
Abstract: In this study, we introduce a novel MRI-compatible rotary series elastic actuator module utilizing velocity-sourced ultrasonic motors for force-controlled robots operating within MRI scanners. Unlike previous MRI-compatible SEA designs, our module incorporates a transmission force sensing series elastic actuator structure, with four off-the-shelf compression springs strategically placed between the gearbox housing and the motor housing. This design features a compact size, thus expanding possibilities for a wider range of MRI robotic applications. To achieve precise torque control, we develop a controller that incorporates a disturbance observer tailored for velocity-sourced motors. This controller enhances the robustness of torque control in our actuator module, even in the presence of varying external impedance, thereby augmenting its suitability for MRI-guided medical interventions. Experimental validation demonstrates the actuator's torque control performance in both 3 Tesla MRI and non-MRI environments, achieving a settling time of 0.1 seconds and a steady-state error within 2% of its maximum output torque. Notably, our force controller exhibits consistent performance across low and high external impedance scenarios, in contrast to conventional controllers for velocity-sourced series elastic actuators, which struggle with steady-state performance under low external impedance conditions.
Submitted 11 June, 2024; originally announced June 2024.
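A rough sketch of the control structure described above for a velocity-sourced SEA: spring deflection provides the transmission force measurement, a PI outer loop turns the torque error into a velocity command, and a first-order low-pass disturbance observer compensates for model mismatch. The PI form, gains, and observer details are simplifications of ours, not the paper's controller.

# Hedged sketch of SEA torque control around a velocity-sourced motor.
class SEATorqueController:
    def __init__(self, k_spring, kp, ki, dob_cutoff, dt):
        self.k_spring, self.kp, self.ki, self.dt = k_spring, kp, ki, dt
        self.alpha = dt * dob_cutoff / (1.0 + dt * dob_cutoff)  # LPF coefficient
        self.integ = 0.0   # integral of torque error
        self.d_hat = 0.0   # low-pass estimate of the torque disturbance

    def step(self, tau_des, theta_motor, theta_load, tau_nominal):
        # Transmission force sensing: spring deflection times spring constant.
        tau_meas = self.k_spring * (theta_motor - theta_load)
        err = tau_des - tau_meas
        self.integ += err * self.dt
        # Disturbance observer: filter the gap between the nominal model torque
        # and the measured torque, then subtract it from the velocity command.
        self.d_hat += self.alpha * ((tau_meas - tau_nominal) - self.d_hat)
        vel_cmd = self.kp * err + self.ki * self.integ - self.d_hat
        return vel_cmd, tau_meas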

arXiv:2406.06498 [pdf, other]
Subjects: cs.RO (Robotics); cs.HC (Human-Computer Interaction)
Demonstrating HumanTHOR: A Simulation Platform and Benchmark for Human-Robot Collaboration in a Shared Workspace
Authors: Chenxu Wang, Boyuan Du, Jiaxin Xu, Peiyan Li, Di Guo, Huaping Liu
Abstract: Human-robot collaboration (HRC) in a shared workspace has become a common pattern in real-world robot applications and has garnered significant research interest. However, most existing studies of human-in-the-loop (HITL) collaboration with robots in a shared workspace are evaluated either in simplified game environments or on physical platforms, and therefore suffer from limited realism or limited scalability. To support future studies, we build an embodied framework named HumanTHOR, which enables humans to act in the simulation environment through VR devices to support HITL collaborations in a shared workspace. To validate our system, we build a benchmark of everyday tasks and conduct a preliminary user study with two baseline algorithms. The results show that the robot can effectively assist humans in collaboration, demonstrating the significance of HRC. The comparison among different levels of baselines affirms that our system can adequately evaluate robot capabilities and serve as a benchmark for different robot algorithms. The experimental results also indicate that there is still much room for improvement in this area and that our system can provide a preliminary foundation for future HRC research in a shared workspace. More information about the simulation environment, experiment videos, benchmark descriptions, and additional supplementary materials can be found on the website: https://sites.google.com/view/humanthor/.
Submitted 10 June, 2024; originally announced June 2024.
Comments: In RSS 2024.

arXiv:2406.04942 [pdf, other]
Subjects: cs.CV (Computer Vision and Pattern Recognition)
Joint Spatial-Temporal Modeling and Contrastive Learning for Self-supervised Heart Rate Measurement
Authors: Wei Qian, Qi Li, Kun Li, Xinke Wang, Xiao Sun, Meng Wang, Dan Guo
Abstract: This paper briefly introduces the solutions developed by our team, HFUT-VUT, for Track 1 of self-supervised heart rate measurement in the 3rd Vision-based Remote Physiological Signal Sensing (RePSS) Challenge hosted at IJCAI 2024. The goal is to develop a self-supervised learning algorithm for heart rate (HR) estimation using unlabeled facial videos. To tackle this task, we present two self-supervised HR estimation solutions that integrate spatial-temporal modeling and contrastive learning, respectively. Specifically, we first propose a non-end-to-end self-supervised HR measurement framework based on spatial-temporal modeling, which can effectively capture subtle rPPG clues and leverage the inherent bandwidth and periodicity characteristics of rPPG to constrain the model. Meanwhile, we employ an end-to-end solution based on contrastive learning, aiming to generalize across different scenarios from complementary perspectives. Finally, we combine the strengths of the above solutions through an ensemble strategy to generate the final predictions, leading to a more accurate HR estimation.
As a result, our solutions achieved a remarkable RMSE score of 8.85277 on the test dataset, securing 2nd place in Track 1 of the challenge.
Submitted 7 June, 2024; originally announced June 2024.
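The bandwidth prior mentioned in this abstract (constraining predictions to the plausible heart-rate band) can be illustrated with a small frequency-domain penalty; the 40-180 bpm band and the out-of-band power ratio below are our own illustrative choices, not the team's exact loss.

# Penalise predicted rPPG power that falls outside a plausible heart-rate band.
import torch

def bandwidth_loss(rppg: torch.Tensor, fps: float, lo_bpm=40.0, hi_bpm=180.0):
    # rppg: (batch, time) predicted pulse signal sampled at `fps` frames/second
    x = rppg - rppg.mean(dim=-1, keepdim=True)
    psd = torch.fft.rfft(x, dim=-1).abs() ** 2
    freqs = torch.fft.rfftfreq(x.shape[-1], d=1.0 / fps)           # in Hz
    in_band = (freqs >= lo_bpm / 60.0) & (freqs <= hi_bpm / 60.0)
    total = psd.sum(dim=-1) + 1e-8
    return (psd[..., ~in_band].sum(dim=-1) / total).mean()         # out-of-band ratio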

arXiv:2406.02035 [pdf, other]
Subjects: cs.LG (Machine Learning); cs.AI (Artificial Intelligence)
A Unifying Framework for Action-Conditional Self-Predictive Reinforcement Learning
Authors: Khimya Khetarpal, Zhaohan Daniel Guo, Bernardo Avila Pires, Yunhao Tang, Clare Lyle, Mark Rowland, Nicolas Heess, Diana Borsa, Arthur Guez, Will Dabney
Abstract: Learning a good representation is a crucial challenge for Reinforcement Learning (RL) agents. Self-predictive learning provides means to jointly learn a latent representation and dynamics model by bootstrapping from future latent representations (BYOL). Recent work has developed theoretical insights into these algorithms by studying a continuous-time ODE model for self-predictive representation learning under the simplifying assumption that the algorithm depends on a fixed policy (BYOL-$\Pi$); this assumption is at odds with practical instantiations of such algorithms, which explicitly condition their predictions on future actions. In this work, we take a step towards bridging the gap between theory and practice by analyzing an action-conditional self-predictive objective (BYOL-AC) using the ODE framework, characterizing its convergence properties and highlighting important distinctions between the limiting solutions of the BYOL-$\Pi$ and BYOL-AC dynamics. We show how the two representations are related by a variance equation. This connection leads to a novel variance-like action-conditional objective (BYOL-VAR) and its corresponding ODE. We unify the study of all three objectives through two complementary lenses: a model-based perspective, where each objective is shown to be equivalent to a low-rank approximation of certain dynamics, and a model-free perspective, which establishes relationships between the objectives and their respective value, Q-value, and advantage functions. Our empirical investigations, encompassing both linear function approximation and Deep RL environments, demonstrate that BYOL-AC is better overall in a variety of different settings.
Submitted 4 June, 2024; originally announced June 2024.
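A minimal rendering of an action-conditional self-predictive (BYOL-AC style) update is sketched below; the network interfaces (an online encoder, a stop-gradient target encoder, and an action-conditioned predictor) and the cosine-style normalisation are assumptions for illustration, not the paper's training code.

# Predict the next latent from (latent, action) and regress onto a
# stop-gradient target encoding of the next observation.
import torch
import torch.nn.functional as F

def byol_ac_loss(encoder, target_encoder, predictor, obs, action, next_obs):
    z = encoder(obs)                                    # online latent
    with torch.no_grad():
        z_next_tgt = target_encoder(next_obs)           # target, no gradient
    z_pred = predictor(torch.cat([z, action], dim=-1))  # condition on the action
    z_pred = F.normalize(z_pred, dim=-1)
    z_next_tgt = F.normalize(z_next_tgt, dim=-1)
    return F.mse_loss(z_pred, z_next_tgt)

The structural difference from the fixed-policy BYOL-$\Pi$ variant discussed in the abstract is only that the predictor sees the action, so the learned latent has to carry action-dependent dynamics.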

arXiv:2406.00919 [pdf, other]
Subjects: cs.CV (Computer Vision and Pattern Recognition); cs.MM (Multimedia)
Advancing Weakly-Supervised Audio-Visual Video Parsing via Segment-wise Pseudo Labeling
Authors: Jinxing Zhou, Dan Guo, Yiran Zhong, Meng Wang
Abstract: The Audio-Visual Video Parsing task aims to identify and temporally localize the events that occur in either or both the audio and visual streams of audible videos. It is often performed in a weakly-supervised manner, where only video event labels are provided, i.e., the modalities and the timestamps of the labels are unknown. Due to the lack of densely annotated labels, recent work attempts to leverage pseudo labels to enrich the supervision. A commonly used strategy is to generate pseudo labels by categorizing the known video event labels for each modality. However, the labels are still confined to the video level, and the temporal boundaries of events remain unlabeled. In this paper, we propose a new pseudo label generation strategy that can explicitly assign labels to each video segment by utilizing prior knowledge learned from the open world. Specifically, we exploit the large-scale pretrained models CLIP and CLAP to estimate the events in each video segment and generate segment-level visual and audio pseudo labels, respectively. We then propose a new loss function to exploit these pseudo labels by taking into account their category-richness and segment-richness. A label denoising strategy is also adopted to further improve the visual pseudo labels by flipping them whenever abnormally large forward losses occur. We perform extensive experiments on the LLP dataset, demonstrate the effectiveness of each proposed design, and achieve state-of-the-art video parsing performance on all types of event parsing, i.e., audio events, visual events, and audio-visual events. We also examine the proposed pseudo label generation strategy on a related weakly-supervised audio-visual event localization task, and the experimental results again verify the benefits and generalization of our method.
Submitted 2 June, 2024; originally announced June 2024.
Comments: IJCV 2024 Accepted. arXiv admin note: substantial text overlap with arXiv:2303.02344.
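The segment-level visual pseudo-labelling step can be sketched with an off-the-shelf CLIP checkpoint; the prompt template, the threshold, and the softmax over the video-level candidate classes are illustrative assumptions, and CLAP would play the analogous role for the audio stream.

# Score one segment's representative frame against the video-level event
# classes and keep the classes whose similarity clears a threshold.
import torch
from transformers import CLIPModel, CLIPProcessor

model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

def segment_pseudo_labels(frame, video_level_events, threshold=0.5):
    # frame: a PIL.Image for the segment; video_level_events: list of class names
    prompts = [f"a photo of {e}" for e in video_level_events]
    inputs = processor(text=prompts, images=frame, return_tensors="pt", padding=True)
    with torch.no_grad():
        probs = model(**inputs).logits_per_image.softmax(dim=-1)[0]
    return {e: float(p) for e, p in zip(video_level_events, probs) if p >= threshold}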

arXiv:2405.19730 [pdf]
Subjects: cs.AI (Artificial Intelligence); cs.CV (Computer Vision and Pattern Recognition); cs.LG (Machine Learning)
Research on the Spatial Data Intelligent Foundation Model
Authors: Shaohua Wang, Xing Xie, Yong Li, Danhuai Guo, Zhi Cai, Yu Liu, Yang Yue, Xiao Pan, Feng Lu, Huayi Wu, Zhipeng Gui, Zhiming Ding, Bolong Zheng, Fuzheng Zhang, Jingyuan Wang, Zhengchao Chen, Hao Lu, Jiayi Li, Peng Yue, Wenhao Yu, Yao Yao, Leilei Sun, Yong Zhang, Longbiao Chen, Xiaoping Du, et al. (6 additional authors not shown)
Abstract: This report focuses on spatial data intelligent large models, delving into the principles, methods, and cutting-edge applications of these models. It provides an in-depth discussion on the definition, development history, current status, and trends of spatial data intelligent large models, as well as the challenges they face.
The report systematically elucidates the key technologies of spatial data intelligent large models and their applications in urban environments, aerospace remote sensing, geography, transportation, and other scenarios. Additionally, it summarizes the latest application cases of spatial data intelligent large models in themes such as urban development, multimodal systems, remote sensing, smart transportation, and resource environments. Finally, the report concludes with an overview and outlook on the development prospects of spatial data intelligent large models.
Submitted 28 August, 2024; v1 submitted 30 May, 2024; originally announced May 2024.
Comments: V1 and V2 are in Chinese; other versions are in English.

arXiv:2405.19107 [pdf, ps, other]
Subjects: cs.LG (Machine Learning); cs.AI (Artificial Intelligence)
Offline Regularised Reinforcement Learning for Large Language Models Alignment
Authors: Pierre Harvey Richemond, Yunhao Tang, Daniel Guo, Daniele Calandriello, Mohammad Gheshlaghi Azar, Rafael Rafailov, Bernardo Avila Pires, Eugene Tarassov, Lucas Spangher, Will Ellsworth, Aliaksei Severyn, Jonathan Mallinson, Lior Shani, Gil Shamir, Rishabh Joshi, Tianqi Liu, Remi Munos, Bilal Piot
Abstract: The dominant framework for alignment of large language models (LLM), whether through reinforcement learning from human feedback or direct preference optimisation, is to learn from preference data.
This involves building datasets where each element is a quadruplet composed of a prompt, two independent responses (completions of the prompt) and a human preference between the two independent responses, yielding a preferred and a dis-preferred response. Such data is typically scarce and expensive to collect. On the other hand, single-trajectory datasets, where each element is a triplet composed of a prompt, a response, and a human feedback signal, are naturally more abundant. The canonical element of such datasets is, for instance, an LLM's response to a user's prompt followed by the user's feedback, such as a thumbs-up/down. Consequently, in this work, we propose DRO, or Direct Reward Optimisation, as a framework and associated algorithms that do not require pairwise preferences. DRO uses a simple mean-squared objective that can be implemented in various ways. We validate our findings empirically, using T5 encoder-decoder language models, and show DRO's performance over selected baselines such as Kahneman-Tversky Optimization (KTO). Thus, we confirm that DRO is a simple and empirically compelling method for single-trajectory policy optimisation.
Submitted 29 May, 2024; originally announced May 2024.
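The abstract only says that DRO uses a simple mean-squared objective over (prompt, response, feedback) triplets; one plausible minimal rendering, which regresses the observed reward onto a learned per-prompt baseline plus a scaled policy log-ratio, is sketched below. Treat the exact parameterisation as our reading, not code or equations from the paper.

# Hedged sketch of a mean-squared single-trajectory objective.
import torch

def dro_style_loss(logp_pi, logp_ref, reward, value, beta=0.1):
    # logp_pi, logp_ref: summed log-probabilities of the response under the
    # trained policy and a frozen reference policy; reward: scalar feedback
    # (e.g. thumbs-up = 1.0, thumbs-down = 0.0); value: learned baseline V(x).
    residual = reward - value - beta * (logp_pi - logp_ref.detach())
    return (residual ** 2).mean()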

arXiv:2405.18300 [pdf, other]
Subjects: cs.AI (Artificial Intelligence)
DOI: 10.24963/ijcai.2024/10
CompetEvo: Towards Morphological Evolution from Competition
Authors: Kangyao Huang, Di Guo, Xinyu Zhang, Xiangyang Ji, Huaping Liu
Abstract: Training an agent to adapt to specific tasks through co-optimization of morphology and control has attracted wide attention. However, whether an optimal configuration and set of tactics exists for agents in a multi-agent competition scenario remains an open question. In this context, we propose competitive evolution (CompetEvo), which co-evolves agents' designs and tactics in confrontation. We build arenas consisting of three animals and their evolved derivatives, placing agents with different morphologies in direct competition with each other. The results reveal that our method enables agents to evolve a more suitable design and strategy for fighting compared to fixed-morph agents, allowing them to obtain advantages in combat scenarios. Moreover, we demonstrate the striking behaviors that emerge when confrontations are conducted between asymmetric morphologies.
Submitted 28 May, 2024; originally announced May 2024.
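A generic competitive co-evolution loop is included below only to make the idea of evolving designs and tactics through confrontation concrete; the mutate and fight callables are placeholders, and nothing here comes from the CompetEvo implementation.

# Toy competitive co-evolution: offspring are mutated copies, and survival is
# decided by round-robin confrontation wins.
import random

def coevolve(population, mutate, fight, generations=100):
    # population: list of (morphology, policy) agents
    for _ in range(generations):
        offspring = [mutate(random.choice(population)) for _ in population]
        candidates = population + offspring
        wins = {id(a): 0 for a in candidates}
        for a in candidates:
            for b in candidates:
                if a is not b and fight(a, b):   # True if a beats b
                    wins[id(a)] += 1
        candidates.sort(key=lambda a: wins[id(a)], reverse=True)
        population = candidates[: len(population)]
    return population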

arXiv:2405.17903 [pdf, other]
Subjects: cs.CV (Computer Vision and Pattern Recognition); q-bio.NC (Neurons and Cognition)
DOI: 10.1016/j.neunet.2024.106493
Reliable Object Tracking by Multimodal Hybrid Feature Extraction and Transformer-Based Fusion
Authors: Hongze Sun, Rui Liu, Wuque Cai, Jun Wang, Yue Wang, Huajin Tang, Yan Cui, Dezhong Yao, Daqing Guo
Abstract: Visual object tracking, which is primarily based on visible light image sequences, encounters numerous challenges in complicated scenarios, such as low light conditions, high dynamic ranges, and background clutter.
To address these challenges, incorporating the advantages of multiple visual modalities is a promising solution for achieving reliable object tracking. However, the existing approaches usually integrate multimodal inputs through adaptive local feature interactions, which cannot leverage the full potential of visual cues, thus resulting in insufficient feature modeling. In this study, we propose a novel multimodal hybrid tracker (MMHT) that utilizes frame-event-based data for reliable single object tracking. The MMHT model employs a hybrid backbone consisting of an artificial neural network (ANN) and a spiking neural network (SNN) to extract dominant features from different visual modalities and then uses a unified encoder to align the features across different domains. Moreover, we propose an enhanced transformer-based module to fuse multimodal features using attention mechanisms. With these methods, the MMHT model can effectively construct a multiscale and multidimensional visual feature space and achieve discriminative feature modeling. Extensive experiments demonstrate that the MMHT model exhibits competitive performance in comparison with other state-of-the-art methods. Overall, our results highlight the effectiveness of the MMHT model in addressing the challenges faced in visual object tracking tasks.
Submitted 28 May, 2024; originally announced May 2024.
Comments: 16 pages, 7 figures, 9 tables; this work has been submitted for possible publication.
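The transformer-based fusion described in this last entry can be illustrated with a minimal cross-attention block over frame-branch and event-branch tokens; the dimensions, pooling, and module layout are placeholder assumptions, and the hybrid ANN/SNN backbones that would produce the tokens are not shown.

# Frame tokens attend to event tokens and vice versa, then the pooled streams
# are concatenated and projected to a fused clip-level feature.
import torch
import torch.nn as nn

class CrossModalFusion(nn.Module):
    def __init__(self, dim=256, heads=8):
        super().__init__()
        self.frame_to_event = nn.MultiheadAttention(dim, heads, batch_first=True)
        self.event_to_frame = nn.MultiheadAttention(dim, heads, batch_first=True)
        self.proj = nn.Linear(2 * dim, dim)

    def forward(self, frame_tokens, event_tokens):
        # frame_tokens, event_tokens: (batch, seq_len, dim)
        f, _ = self.frame_to_event(frame_tokens, event_tokens, event_tokens)
        e, _ = self.event_to_frame(event_tokens, frame_tokens, frame_tokens)
        fused = torch.cat([f.mean(dim=1), e.mean(dim=1)], dim=-1)
        return self.proj(fused)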
