
Search | arXiv e-print repository

<a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1&ndash;50 of 385 results for author: <span class="mathjax">Chang, X</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>&nbsp;&nbsp;</span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&amp;query=Chang%2C+X">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." 
type="text" value="Chang, X"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Chang%2C+X&amp;terms-0-field=author&amp;size=50&amp;order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Chang, X"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. 
</div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&amp;query=Chang%2C+X&amp;start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&amp;query=Chang%2C+X&amp;start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Chang%2C+X&amp;start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Chang%2C+X&amp;start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Chang%2C+X&amp;start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Chang%2C+X&amp;start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">&hellip;</span></li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.12892">arXiv:2411.12892</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.12892">pdf</a>, <a href="https://arxiv.org/format/2411.12892">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Selective Attention: Enhancing Transformer through Principled Context Control </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+X">Xuechen Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Chang%2C+X">Xiangyu Chang</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+M">Mingchen Li</a>, <a href="/search/cs?searchtype=author&amp;query=Roy-Chowdhury%2C+A">Amit Roy-Chowdhury</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+J">Jiasi Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Oymak%2C+S">Samet Oymak</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.12892v1-abstract-short" style="display: inline;"> The attention mechanism within the transformer architecture enables the model to weigh and combine tokens based on their relevance to the query. 
While self-attention has enjoyed major success, it notably treats all queries $q$ in the same way by applying the mapping $V^\top\text{softmax}(Kq)$, where $V,K$ are the value and key embeddings respectively. In this work, we argue that this uniform treat&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12892v1-abstract-full').style.display = 'inline'; document.getElementById('2411.12892v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.12892v1-abstract-full" style="display: none;"> The attention mechanism within the transformer architecture enables the model to weigh and combine tokens based on their relevance to the query. While self-attention has enjoyed major success, it notably treats all queries $q$ in the same way by applying the mapping $V^\top\text{softmax}(Kq)$, where $V,K$ are the value and key embeddings respectively. In this work, we argue that this uniform treatment hinders the ability to control contextual sparsity and relevance. As a solution, we introduce the $\textit{Selective Self-Attention}$ (SSA) layer that augments the softmax nonlinearity with a principled temperature scaling strategy. By controlling temperature, SSA adapts the contextual sparsity of the attention map to the query embedding and its position in the context window. Through theory and experiments, we demonstrate that this alleviates attention dilution, aids the optimization process, and enhances the model&#39;s ability to control softmax spikiness of individual queries. We also incorporate temperature scaling for value embeddings and show that it boosts the model&#39;s ability to suppress irrelevant/noisy tokens. Notably, SSA is a lightweight method which introduces less than 0.5% new parameters through a weight-sharing strategy and can be fine-tuned on existing LLMs. Extensive empirical evaluations demonstrate that SSA-equipped models achieve a noticeable and consistent accuracy improvement on language modeling benchmarks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12892v1-abstract-full').style.display = 'none'; document.getElementById('2411.12892v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
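The uniform mapping $V^\top\text{softmax}(Kq)$ and the temperature-scaling idea summarized in this abstract can be illustrated with a small sketch. The following NumPy snippet shows plain softmax attention with an optional per-query temperature; the shapes, the example temperature values, and the way the temperature is supplied are illustrative assumptions, not the parameterization used in the paper.

    import numpy as np

    def softmax(x, axis=-1):
        x = x - x.max(axis=axis, keepdims=True)  # subtract max for numerical stability
        e = np.exp(x)
        return e / e.sum(axis=axis, keepdims=True)

    def attention(Q, K, V, tau=None):
        """Softmax attention with an optional per-query temperature.

        Q: (n, d) queries, K: (m, d) keys, V: (m, d) values.
        tau: (n,) per-query temperatures; tau=None reproduces the uniform
        mapping V^T softmax(K q) applied identically to every query.
        """
        scores = Q @ K.T                      # (n, m) relevance of each key to each query
        if tau is not None:
            scores = scores / tau[:, None]    # smaller tau -> spikier (sparser) attention
        return softmax(scores, axis=-1) @ V   # (n, d) mixed values

    # Toy usage: one "cold" query attends sharply, one "hot" query attends diffusely.
    rng = np.random.default_rng(0)
    Q, K, V = rng.normal(size=(2, 8)), rng.normal(size=(5, 8)), rng.normal(size=(5, 8))
    out = attention(Q, K, V, tau=np.array([0.3, 3.0]))
    print(out.shape)  # (2, 8)

Lowering the temperature for a given query sharpens its attention distribution (more contextual sparsity), while raising it spreads attention more evenly; that per-query knob is the effect the abstract describes.
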
2. arXiv:2411.11278 [cs.CV, cs.MM]
   Towards Open-Vocabulary Audio-Visual Event Localization
   Authors: Jinxing Zhou, Dan Guo, Ruohao Guo, Yuxin Mao, Jingjing Hu, Yiran Zhong, Xiaojun Chang, Meng Wang
   Abstract: The Audio-Visual Event Localization (AVEL) task aims to temporally locate and classify video events that are both audible and visible. Most research in this field assumes a closed-set setting, which restricts these models' ability to handle test data containing event categories absent (unseen) during training. Recently, a few studies have explored AVEL in an open-set setting, enabling the recognition of unseen events as "unknown", but without providing category-specific semantics. In this paper, we advance the field by introducing the Open-Vocabulary Audio-Visual Event Localization (OV-AVEL) problem, which requires localizing audio-visual events and predicting explicit categories for both seen and unseen data at inference. To address this new task, we propose the OV-AVEBench dataset, comprising 24,800 videos across 67 real-life audio-visual scenes (seen:unseen = 46:21), each with manual segment-level annotation. We also establish three evaluation metrics for this task. Moreover, we investigate two baseline approaches, one training-free and one using a further fine-tuning paradigm. Specifically, we utilize the unified multimodal space from the pretrained ImageBind model to extract audio, visual, and textual (event classes) features. The training-free baseline then determines predictions by comparing the consistency of audio-text and visual-text feature similarities. The fine-tuning baseline incorporates lightweight temporal layers to encode temporal relations within the audio and visual modalities, using OV-AVEBench training data for model fine-tuning. We evaluate these baselines on the proposed OV-AVEBench dataset and discuss potential directions for future work in this new field.
   Submitted 17 November, 2024; originally announced November 2024.
   Comments: Project page: https://github.com/jasongief/OV-AVEL

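As a rough sketch of the training-free baseline described above, the snippet below scores each video segment against the candidate event classes using precomputed audio, visual, and text embeddings in a shared space, and favors a class only when both modalities agree. The embeddings are assumed to come from a model such as ImageBind, and the element-wise minimum used as the "consistency" score is an illustrative choice, not the paper's exact decision rule.

    import numpy as np

    def l2norm(x, axis=-1):
        return x / np.linalg.norm(x, axis=axis, keepdims=True)

    def training_free_predict(audio_emb, visual_emb, text_emb):
        """Assign each segment an event-class index from both modalities.

        audio_emb, visual_emb: (T, d) per-segment embeddings in a shared space.
        text_emb: (C, d) embeddings of the candidate event-class names.
        Toy "consistency" rule: a class scores well only if both the audio-text
        and the visual-text cosine similarities are high for that segment.
        """
        a, v, t = l2norm(audio_emb), l2norm(visual_emb), l2norm(text_emb)
        sim_at = a @ t.T                         # (T, C) audio-text similarity
        sim_vt = v @ t.T                         # (T, C) visual-text similarity
        consistency = np.minimum(sim_at, sim_vt) # both modalities must agree
        return consistency.argmax(axis=-1)       # one class index per segment

    # Toy usage with random features standing in for extracted embeddings.
    rng = np.random.default_rng(0)
    T, C, d = 10, 67, 512
    pred = training_free_predict(rng.normal(size=(T, d)),
                                 rng.normal(size=(T, d)),
                                 rng.normal(size=(C, d)))
    print(pred)
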
3. arXiv:2411.09884 [cs.CL]
   Research on Domain-Specific Chinese Spelling Correction Method Based on Plugin Extension Modules
   Authors: Xiaowu Zhang, Hongfei Zhao, Xuan Chang
   Abstract: This paper proposes a Chinese spelling correction method based on plugin extension modules, aimed at addressing the limitations of existing models in handling domain-specific texts. Traditional Chinese spelling correction models are typically trained on general-domain datasets, resulting in poor performance when encountering specialized terminology in domain-specific texts. To address this issue, we design an extension module that learns the features of domain-specific terminology, thereby enhancing the model's correction capabilities within specific domains. This extension module can provide domain knowledge to the model without compromising its general spelling correction performance, thus improving its accuracy in specialized fields. Experimental results demonstrate that after integrating extension modules for medical, legal, and official document domains, the model's correction performance is significantly improved compared to the baseline model without any extension modules.
   Submitted 14 November, 2024; originally announced November 2024.

4. arXiv:2411.07848 [cs.RO, cs.CV]
   NL-SLAM for OC-VLN: Natural Language Grounded SLAM for Object-Centric VLN
   Authors: Sonia Raychaudhuri, Duy Ta, Katrina Ashton, Angel X. Chang, Jiuguang Wang, Bernadette Bucher
   Abstract: Landmark-based navigation (e.g. go to the wooden desk) and relative positional navigation (e.g. move 5 meters forward) are distinct navigation challenges solved very differently in existing robotics navigation methodology. We present a new dataset, OC-VLN, in order to distinctly evaluate grounding object-centric natural language navigation instructions in a method for performing landmark-based navigation. We also propose Natural Language grounded SLAM (NL-SLAM), a method to ground natural language instruction to robot observations and poses. We actively perform NL-SLAM in order to follow object-centric natural language navigation instructions. Our methods leverage pre-trained vision and language foundation models and require no task-specific training. We construct two strong baselines from state-of-the-art methods on related tasks, Object Goal Navigation and Vision Language Navigation, and we show that our approach, NL-SLAM, outperforms these baselines across all our metrics of success on OC-VLN. Finally, we successfully demonstrate the effectiveness of NL-SLAM for performing navigation instruction following in the real world on a Boston Dynamics Spot robot.
   Submitted 12 November, 2024; originally announced November 2024.

5. arXiv:2411.04925 [cs.CV, cs.AI, cs.MA]
   StoryAgent: Customized Storytelling Video Generation via Multi-Agent Collaboration
   Authors: Panwen Hu, Jin Jiang, Jianqi Chen, Mingfei Han, Shengcai Liao, Xiaojun Chang, Xiaodan Liang
   Abstract: The advent of AI-Generated Content (AIGC) has spurred research into automated video generation to streamline conventional processes. However, automating storytelling video production, particularly for customized narratives, remains challenging due to the complexity of maintaining subject consistency across shots. While existing approaches like Mora and AesopAgent integrate multiple agents for Story-to-Video (S2V) generation, they fall short in preserving protagonist consistency and supporting Customized Storytelling Video Generation (CSVG). To address these limitations, we propose StoryAgent, a multi-agent framework designed for CSVG. StoryAgent decomposes CSVG into distinct subtasks assigned to specialized agents, mirroring the professional production process. Notably, our framework includes agents for story design, storyboard generation, video creation, agent coordination, and result evaluation. Leveraging the strengths of different models, StoryAgent enhances control over the generation process, significantly improving character consistency. Specifically, we introduce a customized Image-to-Video (I2V) method, LoRA-BE, to enhance intra-shot temporal consistency, while a novel storyboard generation pipeline is proposed to maintain subject consistency across shots. Extensive experiments demonstrate the effectiveness of our approach in synthesizing highly consistent storytelling videos, outperforming state-of-the-art methods. Our contributions include the introduction of StoryAgent, a versatile framework for video generation tasks, and novel techniques for preserving protagonist consistency.
   Submitted 11 November, 2024; v1 submitted 7 November, 2024; originally announced November 2024.

6. arXiv:2411.00388 [cs.GT, cs.LG]
   Towards Data Valuation via Asymmetric Data Shapley
   Authors: Xi Zheng, Xiangyu Chang, Ruoxi Jia, Yong Tan
   Abstract: As data emerges as a vital driver of technological and economic advancements, a key challenge is accurately quantifying its value in algorithmic decision-making. The Shapley value, a well-established concept from cooperative game theory, has been widely adopted to assess the contribution of individual data sources in supervised machine learning. However, its symmetry axiom assumes all players in the cooperative game are homogeneous, which overlooks the complex structures and dependencies present in real-world datasets. To address this limitation, we extend the traditional data Shapley framework to asymmetric data Shapley, making it flexible enough to incorporate inherent structures within the datasets for structure-aware data valuation. We also introduce an efficient $k$-nearest neighbor-based algorithm for its exact computation. We demonstrate the practical applicability of our framework across various machine learning tasks and data market contexts. The code is available at: https://github.com/xzheng01/Asymmetric-Data-Shapley.
   Submitted 20 November, 2024; v1 submitted 1 November, 2024; originally announced November 2024.

7. arXiv:2410.23780 [cs.CV, cs.AI]
   Driving by the Rules: A Benchmark for Integrating Traffic Sign Regulations into Vectorized HD Map
   Authors: Xinyuan Chang, Maixuan Xue, Xinran Liu, Zheng Pan, Xing Wei
   Abstract: Ensuring adherence to traffic sign regulations is essential for both human and autonomous vehicle navigation. While current benchmark datasets concentrate on lane perception or basic traffic sign recognition, they often overlook the intricate task of integrating these regulations into lane operations. Addressing this gap, we introduce MapDR, a novel dataset designed for the extraction of Driving Rules from traffic signs and their association with vectorized, locally perceived HD Maps. MapDR features over 10,000 annotated video clips that capture the intricate correlation between traffic sign regulations and lanes. We define two pivotal sub-tasks: 1) Rule Extraction from Traffic Sign, which accurately deciphers regulatory instructions, and 2) Rule-Lane Correspondence Reasoning, which aligns these rules with their respective lanes. Built upon this benchmark, we provide a multimodal solution that offers a strong baseline for advancing autonomous driving technologies. It fills a critical gap in the integration of traffic sign rules, contributing to the development of reliable autonomous navigation systems.
   Submitted 31 October, 2024; originally announced October 2024.
   Comments: 27 pages, 13 figures

8. arXiv:2410.21967 [cs.IR, cs.AI]
   Dual Conditional Diffusion Models for Sequential Recommendation
   Authors: Hongtao Huang, Chengkai Huang, Xiaojun Chang, Wen Hu, Lina Yao
   Abstract: Recent advancements in diffusion models have shown promising results in sequential recommendation (SR). However, current diffusion-based methods still exhibit two key limitations. First, they implicitly model the diffusion process for target item embeddings rather than the discrete target item itself, leading to inconsistency in the recommendation process. Second, existing methods rely on either implicit or explicit conditional diffusion models, limiting their ability to fully capture the context of user behavior and leading to less robust target item embeddings. In this paper, we propose the Dual Conditional Diffusion Models for Sequential Recommendation (DCRec), introducing a discrete-to-continuous sequential recommendation diffusion framework. Our framework introduces a complete Markov chain to model the transition from the reversed target item representation to the discrete item index, bridging the discrete and continuous item spaces for diffusion models and ensuring consistency with the diffusion framework. Building on this framework, we present the Dual Conditional Diffusion Transformer (DCDT) that incorporates the implicit conditional and the explicit conditional for diffusion-based SR. Extensive experiments on public benchmark datasets demonstrate that DCRec outperforms state-of-the-art methods.
   Submitted 29 October, 2024; originally announced October 2024.

9. arXiv:2410.19373 [cs.RO]
   An Enhanced Hierarchical Planning Framework for Multi-Robot Autonomous Exploration
   Authors: Gengyuan Cai, Luosong Guo, Xiangmao Chang
   Abstract: The autonomous exploration of environments by multi-robot systems is a critical task with broad applications in rescue missions, exploration endeavors, and beyond. Current approaches often rely on either greedy frontier selection or end-to-end deep reinforcement learning (DRL) methods, yet these methods are frequently hampered by limitations such as short-sightedness, overlooking long-term implications, and convergence difficulties stemming from the intricate high-dimensional learning space. To address these challenges, this paper introduces an innovative integration strategy that combines the low-dimensional action space efficiency of frontier-based methods with the far-sightedness and optimality of DRL-based approaches. We propose a three-tiered planning framework that first identifies frontiers in free space, creating a sparse map representation that lightens data transmission burdens and reduces the DRL action space's dimensionality. Subsequently, we develop a multi-graph neural network (mGNN) that incorporates states of potential targets and robots, leveraging policy-based reinforcement learning to compute affinities, thereby superseding traditional heuristic utility values. Lastly, we implement local routing planning through subsequence search, which avoids exhaustive sequence traversal. Extensive validation across diverse scenarios and comprehensive simulation results demonstrate the effectiveness of our proposed method. Compared to baseline approaches, our framework achieves environmental exploration with fewer time steps and a notable reduction of over 30% in data transmission, showcasing its superiority in terms of efficiency and performance.
   Submitted 25 October, 2024; originally announced October 2024.
   Comments: 8 pages, 6 figures
   ACM Class: I.2.9; E.1

10. arXiv:2410.16499 [cs.CV]
    SINGAPO: Single Image Controlled Generation of Articulated Parts in Objects
    Authors: Jiayi Liu, Denys Iliash, Angel X. Chang, Manolis Savva, Ali Mahdavi-Amiri
    Abstract: We address the challenge of creating 3D assets for household articulated objects from a single image. Prior work on articulated object creation either requires multi-view multi-state input, or only allows coarse control over the generation process. These limitations hinder the scalability and practicality for articulated object modeling. In this work, we propose a method to generate articulated objects from a single image. Observing the object in resting state from an arbitrary view, our method generates an articulated object that is visually consistent with the input image. To capture the ambiguity in part shape and motion posed by a single view of the object, we design a diffusion model that learns the plausible variations of objects in terms of geometry and kinematics. To tackle the complexity of generating structured data with attributes in multiple domains, we design a pipeline that produces articulated objects from high-level structure to geometric details in a coarse-to-fine manner, where we use a part connectivity graph and part abstraction as proxies. Our experiments show that our method outperforms the state-of-the-art in articulated object creation by a large margin in terms of the generated object realism, resemblance to the input image, and reconstruction quality.
    Submitted 29 October, 2024; v1 submitted 21 October, 2024; originally announced October 2024.
    Comments: Project page: https://3dlg-hcvc.github.io/singapo

11. arXiv:2410.15279 [cs.CV, cs.AI, cs.MM]
    ContextDet: Temporal Action Detection with Adaptive Context Aggregation
    Authors: Ning Wang, Yun Xiao, Xiaopeng Peng, Xiaojun Chang, Xuanhong Wang, Dingyi Fang
    Abstract: Temporal action detection (TAD), which locates and recognizes action segments, remains a challenging task in video understanding due to variable segment lengths and ambiguous boundaries. Existing methods treat neighboring contexts of an action segment indiscriminately, leading to imprecise boundary predictions. We introduce a single-stage ContextDet framework, which makes use of large-kernel convolutions in TAD for the first time. Our model features a pyramid adaptive context aggregation (ACA) architecture, capturing long context and improving action discriminability. Each ACA level consists of two novel modules. The context attention module (CAM) identifies salient contextual information, encourages context diversity, and preserves context integrity through a context gating block (CGB). The long context module (LCM) makes use of a mixture of large- and small-kernel convolutions to adaptively gather long-range context and fine-grained local features. Additionally, by varying the length of these large kernels across the ACA pyramid, our model provides lightweight yet effective context aggregation and action discrimination. We conducted extensive experiments and compared our model with a number of advanced TAD methods on six challenging TAD benchmarks: MultiThumos, Charades, FineAction, EPIC-Kitchens 100, Thumos14, and HACS, demonstrating superior accuracy at reduced inference speed.
    Submitted 20 October, 2024; originally announced October 2024.

The system is comprised of eight modules, three of which are dedicated to motion generation, while the remaining five are augmentation modules that ensure consistent fusion of motion sequences and system functionality. Central to the generation modules is our novel 3D scene-aware human-human interaction module, which addresses collision issues by synthesizing implicit 3D Signed Distance Function (SDF) points around motion spaces, thereby minimizing human-scene collisions without additional data collection costs. Complementing this, our locomotion and human-scene interaction modules leverage existing methods to enrich the system&#39;s motion generation capabilities. Augmentation modules encompass plot comprehension for command generation, motion synchronization for seamless integration of different motion types, hand pose retrieval to enhance motion realism, motion collision revision to prevent human collisions, and 3D retargeting to ensure visual fidelity. Experimental evaluations validate the system&#39;s ability to generate high-quality, diverse, and physically realistic motions, underscoring its potential for advancing creative workflows. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.10790v1-abstract-full').style.display = 'none'; document.getElementById('2410.10790v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Code Page: https://github.com/WindVChen/Sitcom-Crafter</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.00350">arXiv:2410.00350</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.00350">pdf</a>, <a href="https://arxiv.org/format/2410.00350">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Efficient Training of Large Vision Models via Advanced Automated Progressive Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Li%2C+C">Changlin Li</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+J">Jiawei Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Lin%2C+S">Sihao Lin</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+Z">Zongxin Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Liang%2C+J">Junwei Liang</a>, <a href="/search/cs?searchtype=author&amp;query=Liang%2C+X">Xiaodan Liang</a>, <a href="/search/cs?searchtype=author&amp;query=Chang%2C+X">Xiaojun Chang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.00350v1-abstract-short" style="display: inline;"> The rapid advancements in Large Vision Models (LVMs), such as Vision Transformers (ViTs) and diffusion models, have led to an increasing demand for computational resources, 
resulting in substantial financial and environmental costs. This growing challenge highlights the necessity of developing efficient training methods for LVMs. Progressive learning, a training strategy in which model capacity gr&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.00350v1-abstract-full').style.display = 'inline'; document.getElementById('2410.00350v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.00350v1-abstract-full" style="display: none;"> The rapid advancements in Large Vision Models (LVMs), such as Vision Transformers (ViTs) and diffusion models, have led to an increasing demand for computational resources, resulting in substantial financial and environmental costs. This growing challenge highlights the necessity of developing efficient training methods for LVMs. Progressive learning, a training strategy in which model capacity gradually increases during training, has shown potential in addressing these challenges. In this paper, we present an advanced automated progressive learning (AutoProg) framework for efficient training of LVMs. We begin by focusing on the pre-training of LVMs, using ViTs as a case study, and propose AutoProg-One, an AutoProg scheme featuring momentum growth (MoGrow) and a one-shot growth schedule search. Beyond pre-training, we extend our approach to tackle transfer learning and fine-tuning of LVMs. We expand the scope of AutoProg to cover a wider range of LVMs, including diffusion models. First, we introduce AutoProg-Zero, by enhancing the AutoProg framework with a novel zero-shot unfreezing schedule search, eliminating the need for one-shot supernet training. Second, we introduce a novel Unique Stage Identifier (SID) scheme to bridge the gap during network growth. These innovations, integrated with the core principles of AutoProg, offer a comprehensive solution for efficient training across various LVM scenarios. Extensive experiments show that AutoProg accelerates ViT pre-training by up to 1.85x on ImageNet and accelerates fine-tuning of diffusion models by up to 2.86x, with comparable or even higher performance. This work provides a robust and scalable approach to efficient training of LVMs, with potential applications in a wide range of vision tasks. Code: https://github.com/changlin31/AutoProg-Zero <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.00350v1-abstract-full').style.display = 'none'; document.getElementById('2410.00350v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Code: https://github.com/changlin31/AutoProg-Zero. 
arXiv admin note: substantial text overlap with arXiv:2203.14509</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.18896">arXiv:2409.18896</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2409.18896">pdf</a>, <a href="https://arxiv.org/format/2409.18896">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> S2O: Static to Openable Enhancement for Articulated 3D Objects </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Iliash%2C+D">Denys Iliash</a>, <a href="/search/cs?searchtype=author&amp;query=Jiang%2C+H">Hanxiao Jiang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Y">Yiming Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Savva%2C+M">Manolis Savva</a>, <a href="/search/cs?searchtype=author&amp;query=Chang%2C+A+X">Angel X. Chang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.18896v1-abstract-short" style="display: inline;"> Despite much progress in large 3D datasets there are currently few interactive 3D object datasets, and their scale is limited due to the manual effort required in their construction. We introduce the static to openable (S2O) task which creates interactive articulated 3D objects from static counterparts through openable part detection, motion prediction, and interior geometry completion. We formula&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.18896v1-abstract-full').style.display = 'inline'; document.getElementById('2409.18896v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.18896v1-abstract-full" style="display: none;"> Despite much progress in large 3D datasets there are currently few interactive 3D object datasets, and their scale is limited due to the manual effort required in their construction. We introduce the static to openable (S2O) task which creates interactive articulated 3D objects from static counterparts through openable part detection, motion prediction, and interior geometry completion. We formulate a unified framework to tackle this task, and curate a challenging dataset of openable 3D objects that serves as a test bed for systematic evaluation. Our experiments benchmark methods from prior work and simple yet effective heuristics for the S2O task. We find that turning static 3D objects into interactively openable counterparts is possible but that all methods struggle to generalize to realistic settings of the task, and we highlight promising future work directions. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.18896v1-abstract-full').style.display = 'none'; document.getElementById('2409.18896v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.17566">arXiv:2409.17566</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2409.17566">pdf</a>, <a href="https://arxiv.org/format/2409.17566">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Flexiffusion: Segment-wise Neural Architecture Search for Flexible Denoising Schedule </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Huang%2C+H">Hongtao Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Chang%2C+X">Xiaojun Chang</a>, <a href="/search/cs?searchtype=author&amp;query=Yao%2C+L">Lina Yao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.17566v1-abstract-short" style="display: inline;"> Diffusion models are cutting-edge generative models adept at producing diverse, high-quality images. Despite their effectiveness, these models often require significant computational resources owing to their numerous sequential denoising steps and the significant inference cost of each step. Recently, Neural Architecture Search (NAS) techniques have been employed to automatically search for faster&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.17566v1-abstract-full').style.display = 'inline'; document.getElementById('2409.17566v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.17566v1-abstract-full" style="display: none;"> Diffusion models are cutting-edge generative models adept at producing diverse, high-quality images. Despite their effectiveness, these models often require significant computational resources owing to their numerous sequential denoising steps and the significant inference cost of each step. Recently, Neural Architecture Search (NAS) techniques have been employed to automatically search for faster generation processes. However, NAS for diffusion is inherently time-consuming as it requires estimating thousands of diffusion models to search for the optimal one. In this paper, we introduce Flexiffusion, a novel training-free NAS paradigm designed to accelerate diffusion models by concurrently optimizing generation steps and network structures. Specifically, we partition the generation process into isometric step segments, each sequentially composed of a full step, multiple partial steps, and several null steps. The full step computes all network blocks, while the partial step involves part of the blocks, and the null step entails no computation. Flexiffusion autonomously explores flexible step combinations for each segment, substantially reducing search costs and enabling greater acceleration compared to the state-of-the-art (SOTA) method for diffusion models. Our searched models reported speedup factors of $2.6\times$ and $1.5\times$ for the original LDM-4-G and the SOTA, respectively. The factors for Stable Diffusion V1.5 and the SOTA are $5.1\times$ and $2.0\times$. 
We also verified the performance of Flexiffusion on multiple datasets, and positive experiment results indicate that Flexiffusion can effectively reduce redundancy in diffusion models. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.17566v1-abstract-full').style.display = 'none'; document.getElementById('2409.17566v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.15088">arXiv:2409.15088</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2409.15088">pdf</a>, <a href="https://arxiv.org/format/2409.15088">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computers and Society">cs.CY</span> </div> </div> <p class="title is-5 mathjax"> AdapFair: Ensuring Continuous Fairness for Machine Learning Operations </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Huang%2C+Y">Yinghui Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Tang%2C+Z">Zihao Tang</a>, <a href="/search/cs?searchtype=author&amp;query=Chang%2C+X">Xiangyu Chang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.15088v1-abstract-short" style="display: inline;"> The biases and discrimination of machine learning algorithms have attracted significant attention, leading to the development of various algorithms tailored to specific contexts. However, these solutions often fall short of addressing fairness issues inherent in machine learning operations. In this paper, we present a debiasing framework designed to find an optimal fair transformation of input dat&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.15088v1-abstract-full').style.display = 'inline'; document.getElementById('2409.15088v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.15088v1-abstract-full" style="display: none;"> The biases and discrimination of machine learning algorithms have attracted significant attention, leading to the development of various algorithms tailored to specific contexts. However, these solutions often fall short of addressing fairness issues inherent in machine learning operations. In this paper, we present a debiasing framework designed to find an optimal fair transformation of input data that maximally preserves data predictability. A distinctive feature of our approach is its flexibility and efficiency. It can be integrated with any downstream black-box classifiers, providing continuous fairness guarantees with minimal retraining efforts, even in the face of frequent data drifts, evolving fairness requirements, and batches of similar tasks. 
To achieve this, we leverage the normalizing flows to enable efficient, information-preserving data transformation, ensuring that no critical information is lost during the debiasing process. Additionally, we incorporate the Wasserstein distance as the unfairness measure to guide the optimization of data transformations. Finally, we introduce an efficient optimization algorithm with closed-form gradient computations, making our framework scalable and suitable for dynamic, real-world environments. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.15088v1-abstract-full').style.display = 'none'; document.getElementById('2409.15088v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">18 pages, 15 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.14039">arXiv:2409.14039</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2409.14039">pdf</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> </div> </div> <p class="title is-5 mathjax"> Towards Lightweight and Privacy-preserving Data Provision in Digital Forensics for Driverless Taxi </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Gong%2C+Y">Yanwei Gong</a>, <a href="/search/cs?searchtype=author&amp;query=Chang%2C+X">Xiaolin Chang</a>, <a href="/search/cs?searchtype=author&amp;query=Mi%C5%A1i%C4%87%2C+J">Jelena Mišić</a>, <a href="/search/cs?searchtype=author&amp;query=Mi%C5%A1i%C4%87%2C+V+B">Vojislav B. Mišić</a>, <a href="/search/cs?searchtype=author&amp;query=Fan%2C+J">Junchao Fan</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+K">Kaiwen Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.14039v1-abstract-short" style="display: inline;"> Data provision, referring to the data upload and data access, is one key phase in vehicular digital forensics. The unique features of Driverless Taxi (DT) bring new issues to this phase: 1) efficient verification of data integrity when diverse Data Providers (DPs) upload data; 2) DP privacy preservation during data upload; and 3) privacy preservation of both data and INvestigator (IN) under comple&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.14039v1-abstract-full').style.display = 'inline'; document.getElementById('2409.14039v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.14039v1-abstract-full" style="display: none;"> Data provision, referring to the data upload and data access, is one key phase in vehicular digital forensics.
The unique features of Driverless Taxi (DT) bring new issues to this phase: 1) efficient verification of data integrity when diverse Data Providers (DPs) upload data; 2) DP privacy preservation during data upload; and 3) privacy preservation of both data and INvestigator (IN) under complex data ownership when accessing data. To this end, we propose a novel Lightweight and Privacy-preserving Data Provision (LPDP) approach consisting of three mechanisms: 1) the Privacy-friendly Batch Verification Mechanism (PBVm) based on elliptic curve cryptography, 2) Data Access Control Mechanism (DACm) based on ciphertext-policy attribute-based encryption, and 3) Decentralized IN Warrant Issuance Mechanism (DIWIm) based on secret sharing. Privacy preservation of data provision is achieved through: 1) ensuring the DP privacy preservation in terms of the location privacy and unlinkability of data upload requests by PBVm, 2) ensuring data privacy preservation by DACm and DIWIm, and 3) ensuring the identity privacy of IN in terms of the anonymity and unlinkability of data access requests without sacrificing the traceability. Lightweight of data provision is achieved through: 1) ensuring scalable verification of data integrity by PBVm, and 2) ensuring low-overhead warrant update with respect to DIWIm. Security analysis and performance evaluation are conducted to validate the security and performance features of LPDP. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.14039v1-abstract-full').style.display = 'none'; document.getElementById('2409.14039v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.13275">arXiv:2409.13275</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2409.13275">pdf</a>, <a href="https://arxiv.org/format/2409.13275">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Adaptive Margin Global Classifier for Exemplar-Free Class-Incremental Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Yao%2C+Z">Zhongren Yao</a>, <a href="/search/cs?searchtype=author&amp;query=Chang%2C+X">Xiaobin Chang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.13275v1-abstract-short" style="display: inline;"> Exemplar-free class-incremental learning (EFCIL) presents a significant challenge as the old class samples are absent for new task learning. Due to the severe imbalance between old and new class samples, the learned classifiers can be easily biased toward the new ones. 
Moreover, continually updating the feature extractor under EFCIL can compromise the discriminative power of old class features, e.&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.13275v1-abstract-full').style.display = 'inline'; document.getElementById('2409.13275v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.13275v1-abstract-full" style="display: none;"> Exemplar-free class-incremental learning (EFCIL) presents a significant challenge as the old class samples are absent for new task learning. Due to the severe imbalance between old and new class samples, the learned classifiers can be easily biased toward the new ones. Moreover, continually updating the feature extractor under EFCIL can compromise the discriminative power of old class features, e.g., leading to less compact and more overlapping distributions across classes. Existing methods mainly focus on handling biased classifier learning. In this work, both cases are considered using the proposed method. Specifically, we first introduce a Distribution-Based Global Classifier (DBGC) to avoid bias factors in existing methods, such as data imbalance and sampling. More importantly, the compromised distributions of old classes are simulated via a simple operation, variance enlarging (VE). Incorporating VE based on DBGC results in a novel classification loss for EFCIL. This loss is proven equivalent to an Adaptive Margin Softmax Cross Entropy (AMarX). The proposed method is thus called Adaptive Margin Global Classifier (AMGC). AMGC is simple yet effective. Extensive experiments show that AMGC achieves superior image classification results on its own under a challenging EFCIL setting. Detailed analysis is also provided for further demonstration. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.13275v1-abstract-full').style.display = 'none'; document.getElementById('2409.13275v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.12370">arXiv:2409.12370</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2409.12370">pdf</a>, <a href="https://arxiv.org/format/2409.12370">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> Robust Audiovisual Speech Recognition Models with Mixture-of-Experts </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wu%2C+Y">Yihan Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Peng%2C+Y">Yifan Peng</a>, <a href="/search/cs?searchtype=author&amp;query=Lu%2C+Y">Yichen Lu</a>, <a href="/search/cs?searchtype=author&amp;query=Chang%2C+X">Xuankai Chang</a>, <a href="/search/cs?searchtype=author&amp;query=Song%2C+R">Ruihua Song</a>, <a href="/search/cs?searchtype=author&amp;query=Watanabe%2C+S">Shinji Watanabe</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.12370v1-abstract-short" style="display: inline;"> Visual signals can enhance audiovisual speech recognition accuracy by providing additional contextual information. Given the complexity of visual signals, an audiovisual speech recognition model requires robust generalization capabilities across diverse video scenarios, presenting a significant challenge. In this paper, we introduce EVA, leveraging the mixture-of-Experts for audioVisual ASR to per&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.12370v1-abstract-full').style.display = 'inline'; document.getElementById('2409.12370v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.12370v1-abstract-full" style="display: none;"> Visual signals can enhance audiovisual speech recognition accuracy by providing additional contextual information. Given the complexity of visual signals, an audiovisual speech recognition model requires robust generalization capabilities across diverse video scenarios, presenting a significant challenge. In this paper, we introduce EVA, leveraging the mixture-of-Experts for audioVisual ASR to perform robust speech recognition for ``in-the-wild&#39;&#39; videos. Specifically, we first encode visual information into visual tokens sequence and map them into speech space by a lightweight projection. Then, we build EVA upon a robust pretrained speech recognition model, ensuring its generalization ability. Moreover, to incorporate visual information effectively, we inject visual information into the ASR model through a mixture-of-experts module. Experiments show our model achieves state-of-the-art results on three benchmarks, which demonstrates the generalization ability of EVA across diverse video domains. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.12370v1-abstract-full').style.display = 'none'; document.getElementById('2409.12370v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">6 pages, 2 figures, accepted by IEEE Spoken Language Technology Workshop 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.09888">arXiv:2409.09888</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2409.09888">pdf</a>, <a href="https://arxiv.org/ps/2409.09888">ps</a>, <a href="https://arxiv.org/format/2409.09888">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Social and Information Networks">cs.SI</span> </div> </div> <p class="title is-5 mathjax"> Flexible Diffusion Scopes with Parameterized Laplacian for Heterophilic Graph Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Lu%2C+Q">Qincheng Lu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+J">Jiaqi Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Luan%2C+S">Sitao Luan</a>, <a href="/search/cs?searchtype=author&amp;query=Chang%2C+X">Xiao-Wen Chang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.09888v1-abstract-short" style="display: inline;"> The ability of Graph Neural Networks (GNNs) to capture long-range and global topology information is limited by the scope of conventional graph Laplacian, leading to unsatisfactory performance on some datasets, particularly on heterophilic graphs. To address this limitation, we propose a new class of parameterized Laplacian matrices, which provably offers more flexibility in controlling the diffus&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.09888v1-abstract-full').style.display = 'inline'; document.getElementById('2409.09888v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.09888v1-abstract-full" style="display: none;"> The ability of Graph Neural Networks (GNNs) to capture long-range and global topology information is limited by the scope of conventional graph Laplacian, leading to unsatisfactory performance on some datasets, particularly on heterophilic graphs. To address this limitation, we propose a new class of parameterized Laplacian matrices, which provably offers more flexibility in controlling the diffusion distance between nodes than the conventional graph Laplacian, allowing long-range information to be adaptively captured through diffusion on graph. Specifically, we first prove that the diffusion distance and spectral distance on graph have an order-preserving relationship. 
With this result, we demonstrate that the parameterized Laplacian can accelerate the diffusion of long-range information, and the parameters in the Laplacian enable flexibility of the diffusion scopes. Based on the theoretical results, we propose topology-guided rewiring mechanism to capture helpful long-range neighborhood information for heterophilic graphs. With this mechanism and the new Laplacian, we propose two GNNs with flexible diffusion scopes: namely the Parameterized Diffusion based Graph Convolutional Networks (PD-GCN) and Graph Attention Networks (PD-GAT). Synthetic experiments reveal the high correlations between the parameters of the new Laplacian and the performance of parameterized GNNs under various graph homophily levels, which verifies that our new proposed GNNs indeed have the ability to adjust the parameters to adaptively capture the global information for different levels of heterophilic graphs. They also outperform the state-of-the-art (SOTA) models on 6 out of 7 real-world benchmark datasets, which further confirms their superiority. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.09888v1-abstract-full').style.display = 'none'; document.getElementById('2409.09888v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">arXiv admin note: substantial text overlap with arXiv:2403.01475</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.05755">arXiv:2409.05755</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2409.05755">pdf</a>, <a href="https://arxiv.org/ps/2409.05755">ps</a>, <a href="https://arxiv.org/format/2409.05755">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Are Heterophily-Specific GNNs and Homophily Metrics Really Effective? Evaluation Pitfalls and New Benchmarks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Luan%2C+S">Sitao Luan</a>, <a href="/search/cs?searchtype=author&amp;query=Lu%2C+Q">Qincheng Lu</a>, <a href="/search/cs?searchtype=author&amp;query=Hua%2C+C">Chenqing Hua</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+X">Xinyu Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+J">Jiaqi Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Chang%2C+X">Xiao-Wen Chang</a>, <a href="/search/cs?searchtype=author&amp;query=Wolf%2C+G">Guy Wolf</a>, <a href="/search/cs?searchtype=author&amp;query=Tang%2C+J">Jian Tang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.05755v1-abstract-short" style="display: inline;"> Over the past decade, Graph Neural Networks (GNNs) have achieved great success on machine learning tasks with relational data. 
However, recent studies have found that heterophily can cause significant performance degradation of GNNs, especially on node-level tasks. Numerous heterophilic benchmark datasets have been put forward to validate the efficacy of heterophily-specific GNNs and various homop&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.05755v1-abstract-full').style.display = 'inline'; document.getElementById('2409.05755v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.05755v1-abstract-full" style="display: none;"> Over the past decade, Graph Neural Networks (GNNs) have achieved great success on machine learning tasks with relational data. However, recent studies have found that heterophily can cause significant performance degradation of GNNs, especially on node-level tasks. Numerous heterophilic benchmark datasets have been put forward to validate the efficacy of heterophily-specific GNNs and various homophily metrics have been designed to help people recognize these malignant datasets. Nevertheless, there still exist multiple pitfalls that severely hinder the proper evaluation of new models and metrics. In this paper, we point out three most serious pitfalls: 1) a lack of hyperparameter tuning; 2) insufficient model evaluation on the real challenging heterophilic datasets; 3) missing quantitative evaluation benchmark for homophily metrics on synthetic graphs. To overcome these challenges, we first train and fine-tune baseline models on $27$ most widely used benchmark datasets, categorize them into three distinct groups: malignant, benign and ambiguous heterophilic datasets, and identify the real challenging subsets of tasks. To our best knowledge, we are the first to propose such taxonomy. Then, we re-evaluate $10$ heterophily-specific state-of-the-arts (SOTA) GNNs with fine-tuned hyperparameters on different groups of heterophilic datasets. Based on the model performance, we reassess their effectiveness on addressing heterophily challenge. At last, we evaluate $11$ popular homophily metrics on synthetic graphs with three different generation approaches. To compare the metrics strictly, we propose the first quantitative evaluation method based on Fréchet distance. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.05755v1-abstract-full').style.display = 'none'; document.getElementById('2409.05755v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024.
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">arXiv admin note: substantial text overlap with arXiv:2407.09618</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.05352">arXiv:2409.05352</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2409.05352">pdf</a>, <a href="https://arxiv.org/format/2409.05352">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Driving with Prior Maps: Unified Vector Prior Encoding for Autonomous Vehicle Mapping </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zeng%2C+S">Shuang Zeng</a>, <a href="/search/cs?searchtype=author&amp;query=Chang%2C+X">Xinyuan Chang</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+X">Xinran Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Pan%2C+Z">Zheng Pan</a>, <a href="/search/cs?searchtype=author&amp;query=Wei%2C+X">Xing Wei</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.05352v2-abstract-short" style="display: inline;"> High-Definition Maps (HD maps) are essential for the precise navigation and decision-making of autonomous vehicles, yet their creation and upkeep present significant cost and timeliness challenges. The online construction of HD maps using on-board sensors has emerged as a promising solution; however, these methods can be impeded by incomplete data due to occlusions and inclement weather. This pape&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.05352v2-abstract-full').style.display = 'inline'; document.getElementById('2409.05352v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.05352v2-abstract-full" style="display: none;"> High-Definition Maps (HD maps) are essential for the precise navigation and decision-making of autonomous vehicles, yet their creation and upkeep present significant cost and timeliness challenges. The online construction of HD maps using on-board sensors has emerged as a promising solution; however, these methods can be impeded by incomplete data due to occlusions and inclement weather. This paper proposes the PriorDrive framework to addresses these limitations by harnessing the power of prior maps, significantly enhancing the robustness and accuracy of online HD map construction. Our approach integrates a variety of prior maps, such as OpenStreetMap&#39;s Standard Definition Maps (SD maps), outdated HD maps from vendors, and locally constructed maps from historical vehicle data. To effectively encode this prior information into online mapping models, we introduce a Hybrid Prior Representation (HPQuery) that standardizes the representation of diverse map elements. At the core of PriorDrive is the Unified Vector Encoder (UVE), which employs a dual encoding mechanism to process vector data. The intra-vector encoder captures fine-grained local features, while the inter-vector encoder integrates global context. 
Furthermore, we propose a segment-level and point-level pre-training strategy that enables the UVE to learn the prior distribution of vector data, thereby improving the encoder&#39;s generalizability and performance. Through extensive testing on the nuScenes dataset, we demonstrate that PriorDrive is highly compatible with various online mapping models and substantially improves map prediction capabilities. The integration of prior maps through the PriorDrive framework offers a robust solution to the challenges of single-perception data, paving the way for more reliable autonomous vehicle navigation. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.05352v2-abstract-full').style.display = 'none'; document.getElementById('2409.05352v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 9 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.00250">arXiv:2409.00250</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2409.00250">pdf</a>, <a href="https://arxiv.org/format/2409.00250">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Medical Report Generation Is A Multi-label Classification Problem </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Fan%2C+Y">Yijian Fan</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+Z">Zhenbang Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+R">Rui Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+M">Mingjie Li</a>, <a href="/search/cs?searchtype=author&amp;query=Chang%2C+X">Xiaojun Chang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.00250v1-abstract-short" style="display: inline;"> Medical report generation is a critical task in healthcare that involves the automatic creation of detailed and accurate descriptions from medical images. Traditionally, this task has been approached as a sequence generation problem, relying on vision-and-language techniques to generate coherent and contextually relevant reports. However, in this paper, we propose a novel perspective: rethinking m&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.00250v1-abstract-full').style.display = 'inline'; document.getElementById('2409.00250v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.00250v1-abstract-full" style="display: none;"> Medical report generation is a critical task in healthcare that involves the automatic creation of detailed and accurate descriptions from medical images. Traditionally, this task has been approached as a sequence generation problem, relying on vision-and-language techniques to generate coherent and contextually relevant reports. 
However, in this paper, we propose a novel perspective: rethinking medical report generation as a multi-label classification problem. By framing the task this way, we leverage the radiology nodes from the commonly used knowledge graph, which can be better captured through classification techniques. To verify our argument, we introduce a novel report generation framework based on BLIP integrated with classified key nodes, which allows for effective report generation with accurate classification of multiple key aspects within the medical images. This approach not only simplifies the report generation process but also significantly enhances performance metrics. Our extensive experiments demonstrate that leveraging key nodes can achieve state-of-the-art (SOTA) performance, surpassing existing approaches across two benchmark datasets. The results underscore the potential of re-envisioning traditional tasks with innovative methodologies, paving the way for more efficient and accurate medical report generation. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.00250v1-abstract-full').style.display = 'none'; document.getElementById('2409.00250v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to 2024 IEEE International Conference on Medical Artificial Intelligence</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.09744">arXiv:2408.09744</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2408.09744">pdf</a>, <a href="https://arxiv.org/format/2408.09744">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> RealCustom++: Representing Images as Real-Word for Real-Time Customization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Mao%2C+Z">Zhendong Mao</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+M">Mengqi Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Ding%2C+F">Fei Ding</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+M">Mingcong Liu</a>, <a href="/search/cs?searchtype=author&amp;query=He%2C+Q">Qian He</a>, <a href="/search/cs?searchtype=author&amp;query=Chang%2C+X">Xiaojun Chang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Y">Yongdong Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.09744v1-abstract-short" style="display: inline;"> Text-to-image customization, which takes given texts and images depicting given subjects as inputs, aims to synthesize new images that align with both text semantics and subject appearance. This task provides precise control over details that text alone cannot capture and is fundamental for various real-world applications, garnering significant interest from academia and industry. 
Existing works f&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.09744v1-abstract-full').style.display = 'inline'; document.getElementById('2408.09744v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.09744v1-abstract-full" style="display: none;"> Text-to-image customization, which takes given texts and images depicting given subjects as inputs, aims to synthesize new images that align with both text semantics and subject appearance. This task provides precise control over details that text alone cannot capture and is fundamental for various real-world applications, garnering significant interest from academia and industry. Existing works follow the pseudo-word paradigm, which involves representing given subjects as pseudo-words and combining them with given texts to collectively guide the generation. However, the inherent conflict and entanglement between the pseudo-words and texts result in a dual-optimum paradox, where subject similarity and text controllability cannot be optimal simultaneously. We propose a novel real-words paradigm termed RealCustom++ that instead represents subjects as non-conflict real words, thereby disentangling subject similarity from text controllability and allowing both to be optimized simultaneously. Specifically, RealCustom++ introduces a novel &#34;train-inference&#34; decoupled framework: (1) During training, RealCustom++ learns the alignment between vision conditions and all real words in the text, ensuring high subject-similarity generation in open domains. This is achieved by the cross-layer cross-scale projector to robustly and finely extract subject features, and a curriculum training recipe that adapts the generated subject to diverse poses and sizes. (2) During inference, leveraging the learned general alignment, an adaptive mask guidance is proposed to only customize the generation of the specific target real word, keeping other subject-irrelevant regions uncontaminated to ensure high text-controllability in real-time. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.09744v1-abstract-full').style.display = 'none'; document.getElementById('2408.09744v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">23 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.06019">arXiv:2408.06019</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2408.06019">pdf</a>, <a href="https://arxiv.org/format/2408.06019">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> HeadGAP: Few-shot 3D Head Avatar via Generalizable Gaussian Priors </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zheng%2C+X">Xiaozheng Zheng</a>, <a href="/search/cs?searchtype=author&amp;query=Wen%2C+C">Chao Wen</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Z">Zhaohu Li</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+W">Weiyi Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Su%2C+Z">Zhuo Su</a>, <a href="/search/cs?searchtype=author&amp;query=Chang%2C+X">Xu Chang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+Y">Yang Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Lv%2C+Z">Zheng Lv</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+X">Xiaoyuan Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Y">Yongjie Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+G">Guidong Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+L">Lan Xu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.06019v1-abstract-short" style="display: inline;"> In this paper, we present a novel 3D head avatar creation approach capable of generalizing from few-shot in-the-wild data with high-fidelity and animatable robustness. Given the underconstrained nature of this problem, incorporating prior knowledge is essential. Therefore, we propose a framework comprising prior learning and avatar creation phases. The prior learning phase leverages 3D head priors&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.06019v1-abstract-full').style.display = 'inline'; document.getElementById('2408.06019v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.06019v1-abstract-full" style="display: none;"> In this paper, we present a novel 3D head avatar creation approach capable of generalizing from few-shot in-the-wild data with high-fidelity and animatable robustness. Given the underconstrained nature of this problem, incorporating prior knowledge is essential. Therefore, we propose a framework comprising prior learning and avatar creation phases. The prior learning phase leverages 3D head priors derived from a large-scale multi-view dynamic dataset, and the avatar creation phase applies these priors for few-shot personalization. Our approach effectively captures these priors by utilizing a Gaussian Splatting-based auto-decoder network with part-based dynamic modeling. Our method employs identity-shared encoding with personalized latent codes for individual identities to learn the attributes of Gaussian primitives. 
During the avatar creation phase, we achieve fast head avatar personalization by leveraging inversion and fine-tuning strategies. Extensive experiments demonstrate that our model effectively exploits head priors and successfully generalizes them to few-shot personalization, achieving photo-realistic rendering quality, multi-view consistency, and stable animation. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.06019v1-abstract-full').style.display = 'none'; document.getElementById('2408.06019v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project page: https://headgap.github.io/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.05503">arXiv:2408.05503</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2408.05503">pdf</a>, <a href="https://arxiv.org/format/2408.05503">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Disentangled Noisy Correspondence Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Dang%2C+Z">Zhuohang Dang</a>, <a href="/search/cs?searchtype=author&amp;query=Luo%2C+M">Minnan Luo</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+J">Jihong Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Jia%2C+C">Chengyou Jia</a>, <a href="/search/cs?searchtype=author&amp;query=Han%2C+H">Haochen Han</a>, <a href="/search/cs?searchtype=author&amp;query=Wan%2C+H">Herun Wan</a>, <a href="/search/cs?searchtype=author&amp;query=Dai%2C+G">Guang Dai</a>, <a href="/search/cs?searchtype=author&amp;query=Chang%2C+X">Xiaojun Chang</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+J">Jingdong Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.05503v1-abstract-short" style="display: inline;"> Cross-modal retrieval is crucial in understanding latent correspondences across modalities. However, existing methods implicitly assume well-matched training data, which is impractical as real-world data inevitably involves imperfect alignments, i.e., noisy correspondences. Although some works explore similarity-based strategies to address such noise, they suffer from sub-optimal similarity predic&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.05503v1-abstract-full').style.display = 'inline'; document.getElementById('2408.05503v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.05503v1-abstract-full" style="display: none;"> Cross-modal retrieval is crucial in understanding latent correspondences across modalities. 
However, existing methods implicitly assume well-matched training data, which is impractical as real-world data inevitably involves imperfect alignments, i.e., noisy correspondences. Although some works explore similarity-based strategies to address such noise, they suffer from sub-optimal similarity predictions influenced by modality-exclusive information (MEI), e.g., background noise in images and abstract definitions in texts. This issue arises as MEI is not shared across modalities, thus aligning it in training can markedly mislead similarity predictions. Moreover, although intuitive, directly applying previous cross-modal disentanglement methods suffers from limited noise tolerance and disentanglement efficacy. Inspired by the robustness of information bottlenecks against noise, we introduce DisNCL, a novel information-theoretic framework for feature Disentanglement in Noisy Correspondence Learning, to adaptively balance the extraction of MII and MEI with certifiable optimal cross-modal disentanglement efficacy. DisNCL then enhances similarity predictions in modality-invariant subspace, thereby greatly boosting similarity-based alleviation strategy for noisy correspondences. Furthermore, DisNCL introduces soft matching targets to model noisy many-to-many relationships inherent in multi-modal input for noise-robust and accurate cross-modal alignment. Extensive experiments confirm DisNCL&#39;s efficacy by 2% average recall improvement. Mutual information estimation and visualization results show that DisNCL learns meaningful MII/MEI subspaces, validating our theoretical analyses. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.05503v1-abstract-full').style.display = 'none'; document.getElementById('2408.05503v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.03178">arXiv:2408.03178</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2408.03178">pdf</a>, <a href="https://arxiv.org/format/2408.03178">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> An Object is Worth 64x64 Pixels: Generating 3D Object via Image Diffusion </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Yan%2C+X">Xingguang Yan</a>, <a href="/search/cs?searchtype=author&amp;query=Lee%2C+H">Han-Hung Lee</a>, <a href="/search/cs?searchtype=author&amp;query=Wan%2C+Z">Ziyu Wan</a>, <a href="/search/cs?searchtype=author&amp;query=Chang%2C+A+X">Angel X. 
Chang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.03178v1-abstract-short" style="display: inline;"> We introduce a new approach for generating realistic 3D models with UV maps through a representation termed &#34;Object Images.&#34; This approach encapsulates surface geometry, appearance, and patch structures within a 64x64 pixel image, effectively converting complex 3D shapes into a more manageable 2D format. By doing so, we address the challenges of both geometric and semantic irregularity inherent in&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.03178v1-abstract-full').style.display = 'inline'; document.getElementById('2408.03178v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.03178v1-abstract-full" style="display: none;"> We introduce a new approach for generating realistic 3D models with UV maps through a representation termed &#34;Object Images.&#34; This approach encapsulates surface geometry, appearance, and patch structures within a 64x64 pixel image, effectively converting complex 3D shapes into a more manageable 2D format. By doing so, we address the challenges of both geometric and semantic irregularity inherent in polygonal meshes. This method allows us to use image generation models, such as Diffusion Transformers, directly for 3D shape generation. Evaluated on the ABO dataset, our generated shapes with patch structures achieve point cloud FID comparable to recent 3D generative models, while naturally supporting PBR material generation. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.03178v1-abstract-full').style.display = 'none'; document.getElementById('2408.03178v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project Page: https://omages.github.io/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.02211">arXiv:2408.02211</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2408.02211">pdf</a>, <a href="https://arxiv.org/format/2408.02211">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> </div> </div> <p class="title is-5 mathjax"> SceneMotifCoder: Example-driven Visual Program Learning for Generating 3D Object Arrangements </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Tam%2C+H+I+I">Hou In Ivan Tam</a>, <a href="/search/cs?searchtype=author&amp;query=Pun%2C+H+I+D">Hou In Derek Pun</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+A+T">Austin T. Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Chang%2C+A+X">Angel X. 
Chang</a>, <a href="/search/cs?searchtype=author&amp;query=Savva%2C+M">Manolis Savva</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.02211v1-abstract-short" style="display: inline;"> Despite advances in text-to-3D generation methods, generation of multi-object arrangements remains challenging. Current methods exhibit failures in generating physically plausible arrangements that respect the provided text description. We present SceneMotifCoder (SMC), an example-driven framework for generating 3D object arrangements through visual program learning. SMC leverages large language m&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.02211v1-abstract-full').style.display = 'inline'; document.getElementById('2408.02211v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.02211v1-abstract-full" style="display: none;"> Despite advances in text-to-3D generation methods, generation of multi-object arrangements remains challenging. Current methods exhibit failures in generating physically plausible arrangements that respect the provided text description. We present SceneMotifCoder (SMC), an example-driven framework for generating 3D object arrangements through visual program learning. SMC leverages large language models (LLMs) and program synthesis to overcome these challenges by learning visual programs from example arrangements. These programs are generalized into compact, editable meta-programs. When combined with 3D object retrieval and geometry-aware optimization, they can be used to create object arrangements varying in arrangement structure and contained objects. Our experiments show that SMC generates high-quality arrangements using meta-programs learned from few examples. Evaluation results demonstrate that object arrangements generated by SMC better conform to user-specified text descriptions and are more physically plausible when compared with state-of-the-art text-to-3D generation and layout methods. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.02211v1-abstract-full').style.display = 'none'; document.getElementById('2408.02211v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024.
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.00624">arXiv:2408.00624</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2408.00624">pdf</a>, <a href="https://arxiv.org/format/2408.00624">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> SynesLM: A Unified Approach for Audio-visual Speech Recognition and Translation via Language Model and Synthetic Data </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Lu%2C+Y">Yichen Lu</a>, <a href="/search/cs?searchtype=author&amp;query=Song%2C+J">Jiaqi Song</a>, <a href="/search/cs?searchtype=author&amp;query=Chang%2C+X">Xuankai Chang</a>, <a href="/search/cs?searchtype=author&amp;query=Bian%2C+H">Hengwei Bian</a>, <a href="/search/cs?searchtype=author&amp;query=Maiti%2C+S">Soumi Maiti</a>, <a href="/search/cs?searchtype=author&amp;query=Watanabe%2C+S">Shinji Watanabe</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.00624v1-abstract-short" style="display: inline;"> In this work, we present SynesLM, a unified model which can perform three multimodal language understanding tasks: audio-visual automatic speech recognition (AV-ASR) and visual-aided speech/machine translation (VST/VMT). Unlike previous research that focused on lip motion as visual cues for speech signals, our work explores more general visual information within entire frames, such as objects and a&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.00624v1-abstract-full').style.display = 'inline'; document.getElementById('2408.00624v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.00624v1-abstract-full" style="display: none;"> In this work, we present SynesLM, a unified model which can perform three multimodal language understanding tasks: audio-visual automatic speech recognition (AV-ASR) and visual-aided speech/machine translation (VST/VMT). Unlike previous research that focused on lip motion as visual cues for speech signals, our work explores more general visual information within entire frames, such as objects and actions. Additionally, we use synthetic image data to enhance the correlation between image and speech data. We benchmark SynesLM against the How2 dataset, demonstrating performance on par with state-of-the-art (SOTA) models dedicated to AV-ASR while maintaining our multitasking framework. Remarkably, for zero-shot AV-ASR, SynesLM achieved SOTA performance by lowering the Word Error Rate (WER) from 43.4% to 39.4% on the VisSpeech Dataset. Furthermore, our results in VST and VMT outperform the previous results, improving the BLEU score to 43.5 from 37.2 for VST, and to 54.8 from 54.4 for VMT.
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.00624v1-abstract-full').style.display = 'none'; document.getElementById('2408.00624v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.20906">arXiv:2407.20906</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2407.20906">pdf</a>, <a href="https://arxiv.org/format/2407.20906">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Data Analysis, Statistics and Probability">physics.data-an</span> </div> </div> <p class="title is-5 mathjax"> Automated Review Generation Method Based on Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wu%2C+S">Shican Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Ma%2C+X">Xiao Ma</a>, <a href="/search/cs?searchtype=author&amp;query=Luo%2C+D">Dehui Luo</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+L">Lulu Li</a>, <a href="/search/cs?searchtype=author&amp;query=Shi%2C+X">Xiangcheng Shi</a>, <a href="/search/cs?searchtype=author&amp;query=Chang%2C+X">Xin Chang</a>, <a href="/search/cs?searchtype=author&amp;query=Lin%2C+X">Xiaoyun Lin</a>, <a href="/search/cs?searchtype=author&amp;query=Luo%2C+R">Ran Luo</a>, <a href="/search/cs?searchtype=author&amp;query=Pei%2C+C">Chunlei Pei</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+Z">Zhi-Jian Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Gong%2C+J">Jinlong Gong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.20906v1-abstract-short" style="display: inline;"> Literature research, vital for scientific advancement, is overwhelmed by the vast ocean of available information. Addressing this, we propose an automated review generation method based on Large Language Models (LLMs) to streamline literature processing and reduce cognitive load. In case study on propane dehydrogenation (PDH) catalysts, our method swiftly generated comprehensive reviews from 343 a&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.20906v1-abstract-full').style.display = 'inline'; document.getElementById('2407.20906v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.20906v1-abstract-full" style="display: none;"> Literature research, vital for scientific advancement, is overwhelmed by the vast ocean of available information. Addressing this, we propose an automated review generation method based on Large Language Models (LLMs) to streamline literature processing and reduce cognitive load. 
In a case study on propane dehydrogenation (PDH) catalysts, our method swiftly generated comprehensive reviews from 343 articles, averaging seconds per article per LLM account. Extended analysis of 1041 articles provided deep insights into catalysts&#39; composition, structure, and performance. Recognizing LLMs&#39; hallucinations, we employed a multi-layered quality control strategy, ensuring our method&#39;s reliability and effective hallucination mitigation. Expert verification confirms the accuracy and citation integrity of generated reviews, demonstrating LLM hallucination risks reduced to below 0.5% with over 95% confidence. The released Windows application enables one-click review generation, aiding researchers in tracking advancements and recommending literature. This approach showcases LLMs&#39; role in enhancing scientific research productivity and sets the stage for further exploration. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.20906v1-abstract-full').style.display = 'none'; document.getElementById('2407.20906v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">16 pages, 3 figures, 3 tables</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.19497">arXiv:2407.19497</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2407.19497">pdf</a>, <a href="https://arxiv.org/format/2407.19497">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Skeleton-based Group Activity Recognition via Spatial-Temporal Panoramic Graph </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Li%2C+Z">Zhengcen Li</a>, <a href="/search/cs?searchtype=author&amp;query=Chang%2C+X">Xinle Chang</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Y">Yueran Li</a>, <a href="/search/cs?searchtype=author&amp;query=Su%2C+J">Jingyong Su</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.19497v1-abstract-short" style="display: inline;"> Group Activity Recognition aims to understand collective activities from videos. Existing solutions primarily rely on the RGB modality, which encounters challenges such as background variations, occlusions, motion blurs, and significant computational overhead.
Meanwhile, current keypoint-based methods offer a lightweight and informative representation of human motions but necessitate accurate indi&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.19497v1-abstract-full').style.display = 'inline'; document.getElementById('2407.19497v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.19497v1-abstract-full" style="display: none;"> Group Activity Recognition aims to understand collective activities from videos. Existing solutions primarily rely on the RGB modality, which encounters challenges such as background variations, occlusions, motion blurs, and significant computational overhead. Meanwhile, current keypoint-based methods offer a lightweight and informative representation of human motions but necessitate accurate individual annotations and specialized interaction reasoning modules. To address these limitations, we design a panoramic graph that incorporates multi-person skeletons and objects to encapsulate group activity, offering an effective alternative to RGB video. This panoramic graph enables Graph Convolutional Network (GCN) to unify intra-person, inter-person, and person-object interactive modeling through spatial-temporal graph convolutions. In practice, we develop a novel pipeline that extracts skeleton coordinates using pose estimation and tracking algorithms and employ Multi-person Panoramic GCN (MP-GCN) to predict group activities. Extensive experiments on Volleyball and NBA datasets demonstrate that the MP-GCN achieves state-of-the-art performance in both accuracy and efficiency. Notably, our method outperforms RGB-based approaches by using only estimated 2D keypoints as input. Code is available at https://github.com/mgiant/MP-GCN <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.19497v1-abstract-full').style.display = 'none'; document.getElementById('2407.19497v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.19373">arXiv:2407.19373</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2407.19373">pdf</a>, <a href="https://arxiv.org/format/2407.19373">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Uncertainty Quantification of Data Shapley via Statistical Inference </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wu%2C+M">Mengmeng Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Z">Zhihong Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+X">Xiang Li</a>, <a href="/search/cs?searchtype=author&amp;query=Jia%2C+R">Ruoxi Jia</a>, <a href="/search/cs?searchtype=author&amp;query=Chang%2C+X">Xiangyu Chang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.19373v1-abstract-short" style="display: inline;"> As data plays an increasingly pivotal role in decision-making, the emergence of data markets underscores the growing importance of data valuation. Within the machine learning landscape, Data Shapley stands out as a widely embraced method for data valuation. However, a limitation of Data Shapley is its assumption of a fixed dataset, contrasting with the dynamic nature of real-world applications whe&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.19373v1-abstract-full').style.display = 'inline'; document.getElementById('2407.19373v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.19373v1-abstract-full" style="display: none;"> As data plays an increasingly pivotal role in decision-making, the emergence of data markets underscores the growing importance of data valuation. Within the machine learning landscape, Data Shapley stands out as a widely embraced method for data valuation. However, a limitation of Data Shapley is its assumption of a fixed dataset, contrasting with the dynamic nature of real-world applications where data constantly evolves and expands. This paper establishes the relationship between Data Shapley and infinite-order U-statistics and addresses this limitation by quantifying the uncertainty of Data Shapley with changes in data distribution from the perspective of U-statistics. We make statistical inferences on data valuation to obtain confidence intervals for the estimations. We construct two different algorithms to estimate this uncertainty and provide recommendations for their applicable situations. We also conduct a series of experiments on various datasets to verify asymptotic normality and propose a practical trading scenario enabled by this method. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.19373v1-abstract-full').style.display = 'none'; document.getElementById('2407.19373v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.16447">arXiv:2407.16447</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2407.16447">pdf</a>, <a href="https://arxiv.org/ps/2407.16447">ps</a>, <a href="https://arxiv.org/format/2407.16447">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> The CHiME-8 DASR Challenge for Generalizable and Array Agnostic Distant Automatic Speech Recognition and Diarization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Cornell%2C+S">Samuele Cornell</a>, <a href="/search/cs?searchtype=author&amp;query=Park%2C+T">Taejin Park</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+S">Steve Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Boeddeker%2C+C">Christoph Boeddeker</a>, <a href="/search/cs?searchtype=author&amp;query=Chang%2C+X">Xuankai Chang</a>, <a href="/search/cs?searchtype=author&amp;query=Maciejewski%2C+M">Matthew Maciejewski</a>, <a href="/search/cs?searchtype=author&amp;query=Wiesner%2C+M">Matthew Wiesner</a>, <a href="/search/cs?searchtype=author&amp;query=Garcia%2C+P">Paola Garcia</a>, <a href="/search/cs?searchtype=author&amp;query=Watanabe%2C+S">Shinji Watanabe</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.16447v1-abstract-short" style="display: inline;"> This paper presents the CHiME-8 DASR challenge which carries on from the previous edition CHiME-7 DASR (C7DASR) and the past CHiME-6 challenge. It focuses on joint multi-channel distant speech recognition (DASR) and diarization with one or more, possibly heterogeneous, devices. The main goal is to spur research towards meeting transcription approaches that can generalize across arbitrary number of&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.16447v1-abstract-full').style.display = 'inline'; document.getElementById('2407.16447v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.16447v1-abstract-full" style="display: none;"> This paper presents the CHiME-8 DASR challenge which carries on from the previous edition CHiME-7 DASR (C7DASR) and the past CHiME-6 challenge. It focuses on joint multi-channel distant speech recognition (DASR) and diarization with one or more, possibly heterogeneous, devices. The main goal is to spur research towards meeting transcription approaches that can generalize across arbitrary number of speakers, diverse settings (formal vs. 
informal conversations), meeting duration, a wide variety of acoustic scenarios, and different recording configurations. Novelties with respect to C7DASR include: i) the addition of NOTSOFAR-1, an additional office/corporate meeting scenario, ii) a manually corrected Mixer 6 development set, iii) a new track in which we allow the use of large language models (LLMs), and iv) a jury award mechanism to encourage participants to also explore more practical and innovative solutions. To lower the entry barrier for participants, we provide a standalone toolkit for downloading and preparing such datasets as well as performing text normalization and scoring their submissions. Furthermore, this year we also provide two baseline systems, one directly inherited from C7DASR and based on ESPnet and another developed on NeMo and based on the NeMo team&#39;s submission in last year&#39;s C7DASR. Baseline system results suggest that the addition of the NOTSOFAR-1 scenario significantly increases the task&#39;s difficulty due to its high number of speakers and very short duration. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.16447v1-abstract-full').style.display = 'none'; document.getElementById('2407.16447v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.14474">arXiv:2407.14474</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2407.14474">pdf</a>, <a href="https://arxiv.org/format/2407.14474">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Contrastive Learning with Counterfactual Explanations for Radiology Report Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Li%2C+M">Mingjie Li</a>, <a href="/search/cs?searchtype=author&amp;query=Lin%2C+H">Haokun Lin</a>, <a href="/search/cs?searchtype=author&amp;query=Qiu%2C+L">Liang Qiu</a>, <a href="/search/cs?searchtype=author&amp;query=Liang%2C+X">Xiaodan Liang</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+L">Ling Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Elsaddik%2C+A">Abdulmotaleb Elsaddik</a>, <a href="/search/cs?searchtype=author&amp;query=Chang%2C+X">Xiaojun Chang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.14474v1-abstract-short" style="display: inline;"> Due to the common content of anatomy, radiology images with their corresponding reports exhibit high similarity. Such inherent data bias can predispose automatic report generation models to learn entangled and spurious representations resulting in misdiagnostic reports.
To tackle these, we propose a novel \textbf{Co}unter\textbf{F}actual \textbf{E}xplanations-based framework (CoFE) for radiology r&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.14474v1-abstract-full').style.display = 'inline'; document.getElementById('2407.14474v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.14474v1-abstract-full" style="display: none;"> Due to the common content of anatomy, radiology images with their corresponding reports exhibit high similarity. Such inherent data bias can predispose automatic report generation models to learn entangled and spurious representations resulting in misdiagnostic reports. To tackle these, we propose a novel \textbf{Co}unter\textbf{F}actual \textbf{E}xplanations-based framework (CoFE) for radiology report generation. Counterfactual explanations serve as a potent tool for understanding how decisions made by algorithms can be changed by asking ``what if&#39;&#39; scenarios. By leveraging this concept, CoFE can learn non-spurious visual representations by contrasting the representations between factual and counterfactual images. Specifically, we derive counterfactual images by swapping a patch between positive and negative samples until a predicted diagnosis shift occurs. Here, positive and negative samples are the most semantically similar but have different diagnosis labels. Additionally, CoFE employs a learnable prompt to efficiently fine-tune the pre-trained large language model, encapsulating both factual and counterfactual content to provide a more generalizable prompt representation. Extensive experiments on two benchmarks demonstrate that leveraging the counterfactual explanations enables CoFE to generate semantically coherent and factually complete reports and outperform in terms of language generation and clinical efficacy metrics. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.14474v1-abstract-full').style.display = 'none'; document.getElementById('2407.14474v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ECCV 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.12170">arXiv:2407.12170</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2407.12170">pdf</a>, <a href="https://arxiv.org/format/2407.12170">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1145/3626772.3657765">10.1145/3626772.3657765 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Neural Passage Quality Estimation for Static Pruning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Chang%2C+X">Xuejun Chang</a>, <a href="/search/cs?searchtype=author&amp;query=Mishra%2C+D">Debabrata Mishra</a>, <a href="/search/cs?searchtype=author&amp;query=Macdonald%2C+C">Craig Macdonald</a>, <a href="/search/cs?searchtype=author&amp;query=MacAvaney%2C+S">Sean MacAvaney</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.12170v1-abstract-short" style="display: inline;"> Neural networks -- especially those that use large, pre-trained language models -- have improved search engines in various ways. Most prominently, they can estimate the relevance of a passage or document to a user&#39;s query. In this work, we depart from this direction by exploring whether neural networks can effectively predict which of a document&#39;s passages are unlikely to be relevant to any query&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.12170v1-abstract-full').style.display = 'inline'; document.getElementById('2407.12170v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.12170v1-abstract-full" style="display: none;"> Neural networks -- especially those that use large, pre-trained language models -- have improved search engines in various ways. Most prominently, they can estimate the relevance of a passage or document to a user&#39;s query. In this work, we depart from this direction by exploring whether neural networks can effectively predict which of a document&#39;s passages are unlikely to be relevant to any query submitted to the search engine. We refer to this query-agnostic estimation of passage relevance as a passage&#39;s quality. We find that our novel methods for estimating passage quality allow passage corpora to be pruned considerably while maintaining statistically equivalent effectiveness; our best methods can consistently prune &gt;25% of passages in a corpora, across various retrieval pipelines. 
Such substantial pruning reduces the operating costs of neural search engines in terms of computing resources, power usage, and carbon footprint -- both when processing queries (thanks to a smaller index size) and when indexing (lightweight models can prune low-quality passages prior to the costly dense or learned sparse encoding step). This work sets the stage for developing more advanced neural &#34;learning-what-to-index&#34; methods. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.12170v1-abstract-full').style.display = 'none'; document.getElementById('2407.12170v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">SIGIR 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.11320">arXiv:2407.11320</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2407.11320">pdf</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> </div> </div> <p class="title is-5 mathjax"> A2E: Attribute-based Anonymity-Enhanced Authentication for Accessing Driverless Taxi Service </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Gong%2C+Y">Yanwei Gong</a>, <a href="/search/cs?searchtype=author&amp;query=Chang%2C+X">Xiaolin Chang</a>, <a href="/search/cs?searchtype=author&amp;query=Mi%C5%A1i%C4%87%2C+J">Jelena Mišić</a>, <a href="/search/cs?searchtype=author&amp;query=Mi%C5%A1i%C4%87%2C+V+B">Vojislav B. Mišić</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.11320v2-abstract-short" style="display: inline;"> Driverless vehicle as a taxi is gaining more attention due to its potential to enhance urban transportation efficiency. However, both unforeseen incidents led by unsupervised physical users&#39; driverless taxi (DT) rides and personalized needs of users when riding in a DT necessitate the authentication of user identity and attributes. Moreover, safeguarding user identity privacy and quickly tracing m&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.11320v2-abstract-full').style.display = 'inline'; document.getElementById('2407.11320v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.11320v2-abstract-full" style="display: none;"> Driverless vehicle as a taxi is gaining more attention due to its potential to enhance urban transportation efficiency. However, both unforeseen incidents led by unsupervised physical users&#39; driverless taxi (DT) rides and personalized needs of users when riding in a DT necessitate the authentication of user identity and attributes. Moreover, safeguarding user identity privacy and quickly tracing malicious users if necessary to enhance the adoption of DTs remains a challenge.
This paper proposes a novel Attribute-based Anonymity Enhanced (A2E) authentication scheme for users to access DT service. From the security aspect, A2E has attribute verifiability, which is achieved by designing a user attribute credential based on a redactable signature. Meanwhile, this attribute credential also satisfies unlinkability and unforgeability. In addition, A2E has enhanced anonymity, which is achieved by designing a decentralized credential issuance mechanism utilizing ring signature and secret sharing, safeguarding user attributes from association with anonymous identities. Moreover, this mechanism provides traceability and non-frameability to users. From the performance aspect, A2E causes low overhead when tracing malicious users and updating credentials. Besides, both scalability and lightweight operation are satisfied, which contributes to A2E&#39;s practicability. We conduct security analysis and performance evaluation to demonstrate the security and performance capabilities of A2E. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.11320v2-abstract-full').style.display = 'none'; document.getElementById('2407.11320v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 15 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.09618">arXiv:2407.09618</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2407.09618">pdf</a>, <a href="https://arxiv.org/format/2407.09618">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Social and Information Networks">cs.SI</span> </div> </div> <p class="title is-5 mathjax"> The Heterophilic Graph Learning Handbook: Benchmarks, Models, Theoretical Analysis, Applications and Challenges </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Luan%2C+S">Sitao Luan</a>, <a href="/search/cs?searchtype=author&amp;query=Hua%2C+C">Chenqing Hua</a>, <a href="/search/cs?searchtype=author&amp;query=Lu%2C+Q">Qincheng Lu</a>, <a href="/search/cs?searchtype=author&amp;query=Ma%2C+L">Liheng Ma</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+L">Lirong Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+X">Xinyu Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+M">Minkai Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Chang%2C+X">Xiao-Wen Chang</a>, <a href="/search/cs?searchtype=author&amp;query=Precup%2C+D">Doina Precup</a>, <a href="/search/cs?searchtype=author&amp;query=Ying%2C+R">Rex Ying</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+S+Z">Stan Z.
Li</a>, <a href="/search/cs?searchtype=author&amp;query=Tang%2C+J">Jian Tang</a>, <a href="/search/cs?searchtype=author&amp;query=Wolf%2C+G">Guy Wolf</a>, <a href="/search/cs?searchtype=author&amp;query=Jegelka%2C+S">Stefanie Jegelka</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.09618v1-abstract-short" style="display: inline;"> Homophily principle, \ie{} nodes with the same labels or similar attributes are more likely to be connected, has been commonly believed to be the main reason for the superiority of Graph Neural Networks (GNNs) over traditional Neural Networks (NNs) on graph-structured data, especially on node-level tasks. However, recent work has identified a non-trivial set of datasets where GNN&#39;s performance com&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.09618v1-abstract-full').style.display = 'inline'; document.getElementById('2407.09618v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.09618v1-abstract-full" style="display: none;"> Homophily principle, \ie{} nodes with the same labels or similar attributes are more likely to be connected, has been commonly believed to be the main reason for the superiority of Graph Neural Networks (GNNs) over traditional Neural Networks (NNs) on graph-structured data, especially on node-level tasks. However, recent work has identified a non-trivial set of datasets where GNN&#39;s performance compared to the NN&#39;s is not satisfactory. Heterophily, i.e. low homophily, has been considered the main cause of this empirical observation. People have begun to revisit and re-evaluate most existing graph models, including graph transformer and its variants, in the heterophily scenario across various kinds of graphs, e.g. heterogeneous graphs, temporal graphs and hypergraphs. Moreover, numerous graph-related applications are found to be closely related to the heterophily problem. In the past few years, considerable effort has been devoted to studying and addressing the heterophily issue. In this survey, we provide a comprehensive review of the latest progress on heterophilic graph learning, including an extensive summary of benchmark datasets and evaluation of homophily metrics on synthetic graphs, meticulous classification of the most updated supervised and unsupervised learning methods, thorough digestion of the theoretical analysis on homophily/heterophily, and broad exploration of the heterophily-related applications. Notably, through detailed experiments, we are the first to categorize benchmark heterophilic datasets into three sub-categories: malignant, benign and ambiguous heterophily. Malignant and ambiguous datasets are identified as the real challenging datasets to test the effectiveness of new models on the heterophily challenge. Finally, we propose several challenges and future directions for heterophilic graph representation learning. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.09618v1-abstract-full').style.display = 'none'; document.getElementById('2407.09618v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Suggestions and comments are welcomed at sitao.luan@mail.mcgill.ca!</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.08126">arXiv:2407.08126</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2407.08126">pdf</a>, <a href="https://arxiv.org/format/2407.08126">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> </div> </div> <p class="title is-5 mathjax"> Label-anticipated Event Disentanglement for Audio-Visual Video Parsing </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+J">Jinxing Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Guo%2C+D">Dan Guo</a>, <a href="/search/cs?searchtype=author&amp;query=Mao%2C+Y">Yuxin Mao</a>, <a href="/search/cs?searchtype=author&amp;query=Zhong%2C+Y">Yiran Zhong</a>, <a href="/search/cs?searchtype=author&amp;query=Chang%2C+X">Xiaojun Chang</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+M">Meng Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.08126v1-abstract-short" style="display: inline;"> Audio-Visual Video Parsing (AVVP) task aims to detect and temporally locate events within audio and visual modalities. Multiple events can overlap in the timeline, making identification challenging. While traditional methods usually focus on improving the early audio-visual encoders to embed more effective features, the decoding phase -- crucial for final event classification, often receives less&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.08126v1-abstract-full').style.display = 'inline'; document.getElementById('2407.08126v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.08126v1-abstract-full" style="display: none;"> Audio-Visual Video Parsing (AVVP) task aims to detect and temporally locate events within audio and visual modalities. Multiple events can overlap in the timeline, making identification challenging. While traditional methods usually focus on improving the early audio-visual encoders to embed more effective features, the decoding phase -- crucial for final event classification, often receives less attention. We aim to advance the decoding phase and improve its interpretability. 
Specifically, we introduce a new decoding paradigm, \underline{l}abel s\underline{e}m\underline{a}ntic-based \underline{p}rojection (LEAP), that employs label texts of event categories, each bearing distinct and explicit semantics, for parsing potentially overlapping events. LEAP works by iteratively projecting encoded latent features of audio/visual segments onto semantically independent label embeddings. This process, enriched by modeling cross-modal (audio/visual-label) interactions, gradually disentangles event semantics within video segments to refine relevant label embeddings, guaranteeing a more discriminative and interpretable decoding process. To facilitate the LEAP paradigm, we propose a semantic-aware optimization strategy, which includes a novel audio-visual semantic similarity loss function. This function leverages the Intersection over Union of audio and visual events (EIoU) as a novel metric to calibrate audio-visual similarities at the feature level, accommodating the varied event densities across modalities. Extensive experiments demonstrate the superiority of our method, achieving new state-of-the-art performance for AVVP and also enhancing the relevant audio-visual event localization task. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.08126v1-abstract-full').style.display = 'none'; document.getElementById('2407.08126v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by ECCV2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.00837">arXiv:2407.00837</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2407.00837">pdf</a>, <a href="https://arxiv.org/format/2407.00837">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Towards Robust Speech Representation Learning for Thousands of Languages </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Chen%2C+W">William Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+W">Wangyou Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Peng%2C+Y">Yifan Peng</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+X">Xinjian Li</a>, <a href="/search/cs?searchtype=author&amp;query=Tian%2C+J">Jinchuan Tian</a>, <a href="/search/cs?searchtype=author&amp;query=Shi%2C+J">Jiatong Shi</a>, <a href="/search/cs?searchtype=author&amp;query=Chang%2C+X">Xuankai Chang</a>, <a href="/search/cs?searchtype=author&amp;query=Maiti%2C+S">Soumi Maiti</a>, <a href="/search/cs?searchtype=author&amp;query=Livescu%2C+K">Karen Livescu</a>, <a
href="/search/cs?searchtype=author&amp;query=Watanabe%2C+S">Shinji Watanabe</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.00837v2-abstract-short" style="display: inline;"> Self-supervised learning (SSL) has helped extend speech technologies to more languages by reducing the need for labeled data. However, models are still far from supporting the world&#39;s 7000+ languages. We propose XEUS, a Cross-lingual Encoder for Universal Speech, trained on over 1 million hours of data across 4057 languages, extending the language coverage of SSL models 4-fold. We combine 1 millio&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.00837v2-abstract-full').style.display = 'inline'; document.getElementById('2407.00837v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.00837v2-abstract-full" style="display: none;"> Self-supervised learning (SSL) has helped extend speech technologies to more languages by reducing the need for labeled data. However, models are still far from supporting the world&#39;s 7000+ languages. We propose XEUS, a Cross-lingual Encoder for Universal Speech, trained on over 1 million hours of data across 4057 languages, extending the language coverage of SSL models 4-fold. We combine 1 million hours of speech from existing publicly accessible corpora with a newly created corpus of 7400+ hours from 4057 languages, which will be publicly released. To handle the diverse conditions of multilingual speech data, we augment the typical SSL masked prediction approach with a novel dereverberation objective, increasing robustness. We evaluate XEUS on several benchmarks, and show that it consistently outperforms or achieves comparable results to state-of-the-art (SOTA) SSL models across a variety of tasks. XEUS sets a new SOTA on the ML-SUPERB benchmark: it outperforms MMS 1B and w2v-BERT 2.0 v2 by 0.8% and 4.4% respectively, despite having less parameters or pre-training data. Checkpoints, code, and data are found in https://www.wavlab.org/activities/2024/xeus/. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.00837v2-abstract-full').style.display = 'none'; document.getElementById('2407.00837v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 30 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Updated affiliations; 20 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.12723">arXiv:2406.12723</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2406.12723">pdf</a>, <a href="https://arxiv.org/format/2406.12723">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> BIOSCAN-5M: A Multimodal Dataset for Insect Biodiversity </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Gharaee%2C+Z">Zahra Gharaee</a>, <a href="/search/cs?searchtype=author&amp;query=Lowe%2C+S+C">Scott C. Lowe</a>, <a href="/search/cs?searchtype=author&amp;query=Gong%2C+Z">ZeMing Gong</a>, <a href="/search/cs?searchtype=author&amp;query=Arias%2C+P+M">Pablo Millan Arias</a>, <a href="/search/cs?searchtype=author&amp;query=Pellegrino%2C+N">Nicholas Pellegrino</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+A+T">Austin T. Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Haurum%2C+J+B">Joakim Bruslund Haurum</a>, <a href="/search/cs?searchtype=author&amp;query=Zarubiieva%2C+I">Iuliia Zarubiieva</a>, <a href="/search/cs?searchtype=author&amp;query=Kari%2C+L">Lila Kari</a>, <a href="/search/cs?searchtype=author&amp;query=Steinke%2C+D">Dirk Steinke</a>, <a href="/search/cs?searchtype=author&amp;query=Taylor%2C+G+W">Graham W. Taylor</a>, <a href="/search/cs?searchtype=author&amp;query=Fieguth%2C+P">Paul Fieguth</a>, <a href="/search/cs?searchtype=author&amp;query=Chang%2C+A+X">Angel X. Chang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.12723v4-abstract-short" style="display: inline;"> As part of an ongoing worldwide effort to comprehend and monitor insect biodiversity, this paper presents the BIOSCAN-5M Insect dataset to the machine learning community and establish several benchmark tasks. BIOSCAN-5M is a comprehensive dataset containing multi-modal information for over 5 million insect specimens, and it significantly expands existing image-based biological datasets by includin&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.12723v4-abstract-full').style.display = 'inline'; document.getElementById('2406.12723v4-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.12723v4-abstract-full" style="display: none;"> As part of an ongoing worldwide effort to comprehend and monitor insect biodiversity, this paper presents the BIOSCAN-5M Insect dataset to the machine learning community and establish several benchmark tasks. BIOSCAN-5M is a comprehensive dataset containing multi-modal information for over 5 million insect specimens, and it significantly expands existing image-based biological datasets by including taxonomic labels, raw nucleotide barcode sequences, assigned barcode index numbers, geographical, and size information. We propose three benchmark experiments to demonstrate the impact of the multi-modal data types on the classification and clustering accuracy. 
First, we pretrain a masked language model on the DNA barcode sequences of the BIOSCAN-5M dataset and demonstrate the impact of using this large reference library on species- and genus-level classification performance. Second, we propose a zero-shot transfer learning task applied to images and DNA barcodes to cluster feature embeddings obtained from self-supervised learning, to investigate whether meaningful clusters can be derived from these representation embeddings. Third, we benchmark multi-modality by performing contrastive learning on DNA barcodes, image data, and taxonomic information. This yields a general shared embedding space enabling taxonomic classification using multiple types of information and modalities. The code repository of the BIOSCAN-5M Insect dataset is available at https://github.com/bioscan-ml/BIOSCAN-5M.
Submitted 12 November, 2024; v1 submitted 18 June, 2024; originally announced June 2024.
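
A minimal sketch of what masked-token pretraining on nucleotide barcode sequences can look like is given below; the character-level vocabulary, model size, and masked positions are illustrative assumptions, not the BIOSCAN-5M training setup.

```python
# Illustrative sketch only: masked-token pretraining on a DNA barcode string
# with a toy character-level vocabulary. Not the BIOSCAN-5M training code.
import torch
import torch.nn as nn
import torch.nn.functional as F

vocab = {c: i for i, c in enumerate("ACGTN")}   # nucleotides (+ ambiguous base)
MASK_ID = len(vocab)                            # extra id for the [MASK] token

def encode(seq):
    return torch.tensor([vocab[c] for c in seq])

barcode = encode("ACGTACGTNACGT")               # placeholder barcode sequence
ids = barcode.clone()
mask = torch.zeros(len(ids), dtype=torch.bool)
mask[[2, 5, 9]] = True                          # positions chosen for illustration
ids[mask] = MASK_ID

emb = nn.Embedding(len(vocab) + 1, 32)
lm_head = nn.Linear(32, len(vocab))
hidden = emb(ids.unsqueeze(0))                  # (1, L, 32); a real model would add
                                                # transformer layers on top of this
logits = lm_head(hidden)[0]
loss = F.cross_entropy(logits[mask], barcode[mask])   # predict the original bases
```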

arXiv:2406.11579 [pdf, other] cs.CV
Duoduo CLIP: Efficient 3D Understanding with Multi-View Images
Authors: Han-Hung Lee, Yiming Zhang, Angel X. Chang
Abstract: We introduce Duoduo CLIP, a model for 3D representation learning that learns shape encodings from multi-view images instead of point clouds. The choice of multi-view images allows us to leverage 2D priors from off-the-shelf CLIP models to facilitate fine-tuning with 3D data. Our approach not only shows better generalization compared to existing point cloud methods, but also reduces GPU requirements and training time. In addition, the model is modified with cross-view attention to leverage information across multiple frames of the object, which further boosts performance. Notably, our model is permutation invariant to the order of multi-view images while being pose-free. Compared to the current SOTA point cloud method that requires 480 A100 hours to train 1 billion model parameters, we only require 57 A5000 hours and 87 million parameters. Multi-view images also provide more flexibility, including the ability to encode objects with a variable number of images, and performance scales when more views are used. In contrast, point cloud based methods require an entire scan or model of the object. We showcase this flexibility with benchmarks from images of real-world objects. Our model also achieves better performance in more fine-grained text to shape retrieval, demonstrating better text-and-shape alignment than point cloud based models.
Submitted 17 October, 2024; v1 submitted 17 June, 2024; originally announced June 2024.
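
The cross-view attention and permutation-invariant pooling described above can be sketched as follows; the embedding size, number of heads, and mean-pooling choice are assumptions for illustration, not the released Duoduo CLIP architecture.

```python
# Sketch under assumptions: per-view CLIP image embeddings are fused with
# cross-view self-attention and mean-pooled into a single shape embedding.
import torch
import torch.nn as nn

views = torch.randn(4, 1, 512)                 # (num_views, batch, embed_dim), toy values
attn = nn.MultiheadAttention(embed_dim=512, num_heads=8)
fused, _ = attn(views, views, views)           # each view attends to all other views
shape_embedding = fused.mean(dim=0)            # (batch, 512)
```

Because self-attention treats the views as an unordered set and the pooling is a mean, permuting the input views leaves the final shape embedding unchanged, which is the permutation-invariance property the abstract mentions.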
<span class="abstract-short has-text-grey-dark mathjax" id="2406.08641v1-abstract-short" style="display: inline;"> ML-SUPERB evaluates self-supervised learning (SSL) models on the tasks of language identification and automatic speech recognition (ASR). This benchmark treats the models as feature extractors and uses a single shallow downstream model, which can be fine-tuned for a downstream task. However, real-world use cases may require different configurations. This paper presents ML-SUPERB~2.0, which is a ne&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.08641v1-abstract-full').style.display = 'inline'; document.getElementById('2406.08641v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.08641v1-abstract-full" style="display: none;"> ML-SUPERB evaluates self-supervised learning (SSL) models on the tasks of language identification and automatic speech recognition (ASR). This benchmark treats the models as feature extractors and uses a single shallow downstream model, which can be fine-tuned for a downstream task. However, real-world use cases may require different configurations. This paper presents ML-SUPERB~2.0, which is a new benchmark for evaluating pre-trained SSL and supervised speech models across downstream models, fine-tuning setups, and efficient model adaptation approaches. We find performance improvements over the setup of ML-SUPERB. However, performance depends on the downstream model design. Also, we find large performance differences between languages and datasets, suggesting the need for more targeted approaches to improve multilingual ASR performance. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.08641v1-abstract-full').style.display = 'none'; document.getElementById('2406.08641v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by Interspeech 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.07725">arXiv:2406.07725</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2406.07725">pdf</a>, <a href="https://arxiv.org/ps/2406.07725">ps</a>, <a href="https://arxiv.org/format/2406.07725">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> The Interspeech 2024 Challenge on Speech Processing Using Discrete Units </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Chang%2C+X">Xuankai Chang</a>, <a href="/search/cs?searchtype=author&amp;query=Shi%2C+J">Jiatong Shi</a>, <a href="/search/cs?searchtype=author&amp;query=Tian%2C+J">Jinchuan Tian</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+Y">Yuning Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Tang%2C+Y">Yuxun Tang</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+Y">Yihan Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Watanabe%2C+S">Shinji Watanabe</a>, <a href="/search/cs?searchtype=author&amp;query=Adi%2C+Y">Yossi Adi</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+X">Xie Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Jin%2C+Q">Qin Jin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.07725v1-abstract-short" style="display: inline;"> Representing speech and audio signals in discrete units has become a compelling alternative to traditional high-dimensional feature vectors. Numerous studies have highlighted the efficacy of discrete units in various applications such as speech compression and restoration, speech recognition, and speech generation. To foster exploration in this domain, we introduce the Interspeech 2024 Challenge,&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.07725v1-abstract-full').style.display = 'inline'; document.getElementById('2406.07725v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.07725v1-abstract-full" style="display: none;"> Representing speech and audio signals in discrete units has become a compelling alternative to traditional high-dimensional feature vectors. Numerous studies have highlighted the efficacy of discrete units in various applications such as speech compression and restoration, speech recognition, and speech generation. To foster exploration in this domain, we introduce the Interspeech 2024 Challenge, which focuses on new speech processing benchmarks using discrete units. It encompasses three pivotal tasks, namely multilingual automatic speech recognition, text-to-speech, and singing voice synthesis, and aims to assess the potential applicability of discrete units in these tasks. This paper outlines the challenge designs and baseline descriptions. 
We also collate baseline and selected submission systems, along with preliminary findings, offering valuable contributions to future research in this evolving field.
Submitted 11 June, 2024; originally announced June 2024.
Comments: This manuscript has been accepted by Interspeech 2024

arXiv:2406.06999 [pdf, other] cs.CV
Teaching with Uncertainty: Unleashing the Potential of Knowledge Distillation in Object Detection
Authors: Junfei Yi, Jianxu Mao, Tengfei Liu, Mingjie Li, Hanyu Gu, Hui Zhang, Xiaojun Chang, Yaonan Wang
Abstract: Knowledge distillation (KD) is a widely adopted and effective method for compressing models in object detection tasks. In particular, feature-based distillation methods have shown remarkable performance. Existing approaches often ignore the uncertainty in the teacher model's knowledge, which stems from data noise and imperfect training.
This limits the student model's ability to learn latent knowledge, as it may overly rely on the teacher's imperfect guidance. In this paper, we propose a novel feature-based distillation paradigm with knowledge uncertainty for object detection, termed "Uncertainty Estimation-Discriminative Knowledge Extraction-Knowledge Transfer (UET)", which can seamlessly integrate with existing distillation methods. By leveraging the Monte Carlo dropout technique, we introduce knowledge uncertainty into the training process of the student model, facilitating deeper exploration of latent knowledge. Our method performs effectively during the KD process without requiring intricate structures or extensive computational resources. Extensive experiments validate the effectiveness of our proposed approach across various distillation strategies, detectors, and backbone architectures. Specifically, following our proposed paradigm, the existing FGD method achieves state-of-the-art (SoTA) performance, with ResNet50-based GFL achieving 44.1% mAP on the COCO dataset, surpassing the baselines by 3.9%.
Submitted 11 June, 2024; originally announced June 2024.
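
A rough sketch of the Monte Carlo dropout idea, estimating teacher uncertainty from repeated stochastic forward passes and down-weighting uncertain positions in a feature-distillation loss, is shown below; the toy teacher, number of samples, and exponential weighting are illustrative assumptions rather than the UET formulation.

```python
# Illustrative sketch (not the paper's code): MC-dropout uncertainty for a
# teacher's features, used to reweight a simple feature-distillation loss.
import torch
import torch.nn as nn

teacher = nn.Sequential(nn.Linear(256, 256), nn.Dropout(p=0.1))  # toy teacher head
student_feat = torch.randn(8, 256)
x = torch.randn(8, 256)

teacher.train()                                  # keep dropout active while sampling
with torch.no_grad():
    samples = torch.stack([teacher(x) for _ in range(10)])   # (10, 8, 256)
mean_feat = samples.mean(dim=0)
uncertainty = samples.var(dim=0).mean(dim=-1)    # one uncertainty score per sample
weight = torch.exp(-uncertainty)                 # trust confident teacher features more
kd_loss = (weight.unsqueeze(-1) * (student_feat - mean_feat) ** 2).mean()
```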

arXiv:2406.02990 [pdf, other] cs.CV
Predicting Genetic Mutation from Whole Slide Images via Biomedical-Linguistic Knowledge Enhanced Multi-label Classification
Authors: Gexin Huang, Chenfei Wu, Mingjie Li, Xiaojun Chang, Ling Chen, Ying Sun, Shen Zhao, Xiaodan Liang, Liang Lin
Abstract: Predicting genetic mutations from whole slide images is indispensable for cancer diagnosis. However, existing work training multiple binary classification models faces two challenges: (a) Training multiple binary classifiers is inefficient and would inevitably lead to a class imbalance problem. (b) The biological relationships among genes are overlooked, which limits the prediction performance. To tackle these challenges, we design a Biological-knowledge enhanced PathGenomic multi-label Transformer (BPGT) to improve genetic mutation prediction performance. BPGT first establishes a novel gene encoder that constructs gene priors by two carefully designed modules: (a) A gene graph whose node features are the genes' linguistic descriptions and the cancer phenotype, with edges modeled by genes' pathway associations and mutation consistencies. (b) A knowledge association module that fuses linguistic and biomedical knowledge into gene priors by transformer-based graph representation learning, capturing the intrinsic relationships between different genes' mutations. BPGT then designs a label decoder that finally performs genetic mutation prediction by two tailored modules: (a) A modality fusion module that first fuses the gene priors with critical regions in WSIs and obtains gene-wise mutation logits. (b) A comparative multi-label loss that emphasizes the inherent comparisons among mutation statuses to enhance the discrimination capabilities. Extensive experiments on The Cancer Genome Atlas benchmark demonstrate that BPGT outperforms the state-of-the-art.
Submitted 5 June, 2024; originally announced June 2024.
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">16 pages, 8 figures, and 3 tables</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.17761">arXiv:2405.17761</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2405.17761">pdf</a>, <a href="https://arxiv.org/format/2405.17761">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Optimization and Control">math.OC</span> </div> </div> <p class="title is-5 mathjax"> Double Variance Reduction: A Smoothing Trick for Composite Optimization Problems without First-Order Gradient </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Di%2C+H">Hao Di</a>, <a href="/search/cs?searchtype=author&amp;query=Ye%2C+H">Haishan Ye</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Y">Yueling Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Chang%2C+X">Xiangyu Chang</a>, <a href="/search/cs?searchtype=author&amp;query=Dai%2C+G">Guang Dai</a>, <a href="/search/cs?searchtype=author&amp;query=Tsang%2C+I+W">Ivor W. Tsang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.17761v1-abstract-short" style="display: inline;"> Variance reduction techniques are designed to decrease the sampling variance, thereby accelerating convergence rates of first-order (FO) and zeroth-order (ZO) optimization methods. However, in composite optimization problems, ZO methods encounter an additional variance called the coordinate-wise variance, which stems from the random gradient estimation. To reduce this variance, prior works require&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.17761v1-abstract-full').style.display = 'inline'; document.getElementById('2405.17761v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.17761v1-abstract-full" style="display: none;"> Variance reduction techniques are designed to decrease the sampling variance, thereby accelerating convergence rates of first-order (FO) and zeroth-order (ZO) optimization methods. However, in composite optimization problems, ZO methods encounter an additional variance called the coordinate-wise variance, which stems from the random gradient estimation. To reduce this variance, prior works require estimating all partial derivatives, essentially approximating FO information. This approach demands O(d) function evaluations (d is the dimension size), which incurs substantial computational costs and is prohibitive in high-dimensional scenarios. This paper proposes the Zeroth-order Proximal Double Variance Reduction (ZPDVR) method, which utilizes the averaging trick to reduce both sampling and coordinate-wise variances. 
Compared to prior methods, ZPDVR relies solely on random gradient estimates, calls the stochastic zeroth-order oracle (SZO) in expectation $\mathcal{O}(1)$ times per iteration, and achieves the optimal $\mathcal{O}(d(n + \kappa)\log(\frac{1}{\varepsilon}))$ SZO query complexity in the strongly convex and smooth setting, where $\kappa$ represents the condition number and $\varepsilon$ is the desired accuracy. Empirical results validate ZPDVR's linear convergence and demonstrate its superior performance over other related methods.
Submitted 27 May, 2024; originally announced May 2024.
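
The stochastic zeroth-order oracle that such methods query can be illustrated with a standard two-point, random-direction gradient estimate; the smoothing parameter and toy objective below are arbitrary, and this is only the kind of SZO call the method builds on, not the ZPDVR algorithm itself.

```python
# Worked sketch of a two-point, random-direction zeroth-order gradient estimate.
import numpy as np

def zo_gradient(f, x, mu=1e-4):
    # Sample a random direction and probe the function twice along it.
    u = np.random.randn(*x.shape)
    return (f(x + mu * u) - f(x - mu * u)) / (2 * mu) * u

f = lambda x: 0.5 * np.dot(x, x)                 # toy smooth objective, true grad = x
x = np.array([1.0, -2.0, 3.0])
g = zo_gradient(f, x)                            # noisy estimate of [1, -2, 3]
```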

arXiv:2405.17537 [pdf, other] cs.AI cs.CL cs.CV
CLIBD: Bridging Vision and Genomics for Biodiversity Monitoring at Scale
Authors: ZeMing Gong, Austin T. Wang, Xiaoliang Huo, Joakim Bruslund Haurum, Scott C. Lowe, Graham W. Taylor, Angel X. Chang
Abstract: Measuring biodiversity is crucial for understanding ecosystem health. While prior works have developed machine learning models for taxonomic classification of photographic images and DNA separately, in this work, we introduce a multimodal approach combining both, using CLIP-style contrastive learning to align images, barcode DNA, and text-based representations of taxonomic labels in a unified embedding space. This allows for accurate classification of both known and unknown insect species without task-specific fine-tuning, leveraging contrastive learning for the first time to fuse DNA and image data. Our method surpasses previous single-modality approaches in accuracy by over 8% on zero-shot learning tasks, showcasing its effectiveness in biodiversity studies.
Submitted 6 November, 2024; v1 submitted 27 May, 2024; originally announced May 2024.
Comments: 25 pages with 11 figures
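
A compact sketch of CLIP-style contrastive alignment across three modalities is given below; the toy embeddings, temperature, and the choice to sum the three pairwise losses are illustrative assumptions, not the CLIBD training recipe.

```python
# Hedged sketch: symmetric InfoNCE-style losses aligning image, DNA-barcode,
# and taxonomy-text embeddings in one shared space. Encoders are toy stand-ins.
import torch
import torch.nn.functional as F

def contrastive(a, b, temperature=0.07):
    a, b = F.normalize(a, dim=-1), F.normalize(b, dim=-1)
    logits = a @ b.t() / temperature
    labels = torch.arange(a.size(0))             # matched pairs sit on the diagonal
    return 0.5 * (F.cross_entropy(logits, labels) +
                  F.cross_entropy(logits.t(), labels))

img = torch.randn(16, 512)   # image-encoder outputs (toy)
dna = torch.randn(16, 512)   # DNA-barcode-encoder outputs (toy)
txt = torch.randn(16, 512)   # taxonomy-label text-encoder outputs (toy)
loss = contrastive(img, dna) + contrastive(img, txt) + contrastive(dna, txt)
```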

arXiv:2405.10255 [pdf, other] cs.CV cs.RO
When LLMs step into the 3D World: A Survey and Meta-Analysis of 3D Tasks via Multi-modal Large Language Models
Authors: Xianzheng Ma, Yash Bhalgat, Brandon Smart, Shuai Chen, Xinghui Li, Jian Ding, Jindong Gu, Dave Zhenyu Chen, Songyou Peng, Jia-Wang Bian, Philip H Torr, Marc Pollefeys, Matthias Nießner, Ian D Reid, Angel X. Chang, Iro Laina, Victor Adrian Prisacariu
Abstract: As large language models (LLMs) evolve, their integration with 3D spatial data (3D-LLMs) has seen rapid progress, offering unprecedented capabilities for understanding and interacting with physical spaces. This survey provides a comprehensive overview of the methodologies enabling LLMs to process, understand, and generate 3D data. Highlighting the unique advantages of LLMs, such as in-context learning, step-by-step reasoning, open-vocabulary capabilities, and extensive world knowledge, we underscore their potential to significantly advance spatial comprehension and interaction within embodied Artificial Intelligence (AI) systems. Our investigation spans various 3D data representations, from point clouds to Neural Radiance Fields (NeRFs). It examines their integration with LLMs for tasks such as 3D scene understanding, captioning, question-answering, and dialogue, as well as LLM-based agents for spatial reasoning, planning, and navigation. The paper also includes a brief review of other methods that integrate 3D and language. The meta-analysis presented in this paper reveals significant progress yet underscores the necessity for novel approaches to harness the full potential of 3D-LLMs. Hence, with this paper, we aim to chart a course for future research that explores and expands the capabilities of 3D-LLMs in understanding and interacting with the complex 3D world. To support this survey, we have established a project page where papers related to our topic are organized and listed: https://github.com/ActiveVisionLab/Awesome-LLM-3D.
Submitted 16 May, 2024; originally announced May 2024.

arXiv:2405.06747 [pdf, other] cs.SD cs.LG eess.AS
Music Emotion Prediction Using Recurrent Neural Networks
Authors: Xinyu Chang, Xiangyu Zhang, Haoruo Zhang, Yulu Ran
Abstract: This study explores the application of recurrent neural networks to recognize emotions conveyed in music, aiming to enhance music recommendation systems and support therapeutic interventions by tailoring music to fit listeners' emotional states. We utilize Russell's Emotion Quadrant to categorize music into four distinct emotional regions and develop models capable of accurately predicting these categories. Our approach involves extracting a comprehensive set of audio features using Librosa and applying various recurrent neural network architectures, including standard RNNs, Bidirectional RNNs, and Long Short-Term Memory (LSTM) networks. Initial experiments are conducted using a dataset of 900 audio clips, labeled according to the emotional quadrants. We compare the performance of our neural network models against a set of baseline classifiers and analyze their effectiveness in capturing the temporal dynamics inherent in musical expression. The results indicate that simpler RNN architectures may perform comparably to, or even better than, more complex models, particularly on smaller datasets. We have also run these experiments on two larger datasets: one augmented from our original dataset and one drawn from other sources.
This research not only enhances our understanding of the emotional impact of music but also demonstrates the potential of neural networks in creating more personalized and emotionally resonant music recommendation and therapy systems.
Submitted 10 May, 2024; originally announced May 2024.
Comments: 15 pages, 13 figures
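
A minimal sketch of the feature-extraction-plus-LSTM pipeline described above follows, using Librosa MFCCs and a single-layer LSTM; the synthetic audio, feature choice, and layer sizes are placeholders rather than the paper's configuration.

```python
# Illustrative sketch only: MFCC features fed to an LSTM that predicts one of
# Russell's four emotion quadrants.
import librosa
import numpy as np
import torch
import torch.nn as nn

sr = 22050
y = np.random.randn(sr * 3).astype(np.float32)       # stand-in for a loaded audio clip
mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=20)   # (20, frames)
feats = torch.tensor(mfcc.T, dtype=torch.float32).unsqueeze(0)  # (1, frames, 20)

lstm = nn.LSTM(input_size=20, hidden_size=64, batch_first=True)
classifier = nn.Linear(64, 4)                        # four emotion quadrants
_, (h, _) = lstm(feats)
quadrant_logits = classifier(h[-1])                  # (1, 4)
```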

arXiv:2405.05010 [pdf, other] cs.CV
${M^2D}$NeRF: Multi-Modal Decomposition NeRF with 3D Feature Fields
Authors: Ning Wang, Lefei Zhang, Angel X. Chang
Abstract: Neural fields (NeRF) have emerged as a promising approach for representing continuous 3D scenes. Nevertheless, the lack of semantic encoding in NeRFs poses a significant challenge for scene decomposition. To address this challenge, we present a single model, Multi-Modal Decomposition NeRF (${M^2D}$NeRF), that is capable of both text-based and visual patch-based edits. Specifically, we use multi-modal feature distillation to integrate teacher features from pretrained visual and language models into 3D semantic feature volumes, thereby facilitating consistent 3D editing. To enforce consistency between the visual and language features in our 3D feature volumes, we introduce a multi-modal similarity constraint. We also introduce a patch-based joint contrastive loss that helps to encourage object regions to coalesce in the 3D feature space, resulting in more precise boundaries. Experiments on various real-world scenes show superior performance in 3D scene decomposition tasks compared to prior NeRF-based methods.
Submitted 8 May, 2024; originally announced May 2024.
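
A very rough sketch of multi-modal feature distillation with a similarity constraint is shown below; the per-point feature heads, teacher features, and the specific consistency term are assumptions for illustration and do not reproduce the ${M^2D}$NeRF model.

```python
# Illustrative sketch only: distilling 2D teacher features (visual and language)
# into per-point 3D feature heads, plus a toy cross-modal consistency term.
import torch
import torch.nn as nn
import torch.nn.functional as F

points = torch.randn(1024, 3)                 # 3D sample points along rays (toy)
vis_head = nn.Linear(3, 384)                  # 3D field for visual teacher features
lang_head = nn.Linear(3, 512)                 # 3D field for language teacher features
vis_teacher = torch.randn(1024, 384)          # projected 2D visual features (toy)
lang_teacher = torch.randn(1024, 512)         # projected 2D language features (toy)

vis_feat = vis_head(points)
lang_feat = lang_head(points)
distill = F.mse_loss(vis_feat, vis_teacher) + F.mse_loss(lang_feat, lang_teacher)
# Toy consistency constraint: the pairwise similarity structure of the two
# feature fields should agree across points.
sim_v = F.normalize(vis_feat, dim=-1) @ F.normalize(vis_feat, dim=-1).t()
sim_l = F.normalize(lang_feat, dim=-1) @ F.normalize(lang_feat, dim=-1).t()
loss = distill + F.mse_loss(sim_v, sim_l)
```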
