Search | arXiv e-print repository
<a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1–50 of 164 results for author: <span class="mathjax">Feng, T</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a> </span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&query=Feng%2C+T">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." 
type="text" value="Feng, T"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Feng%2C+T&terms-0-field=author&size=50&order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Feng, T"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. 
</div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Feng%2C+T&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Feng%2C+T&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Feng%2C+T&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Feng%2C+T&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Feng%2C+T&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.12198">arXiv:2411.12198</a> <span> [<a href="https://arxiv.org/pdf/2411.12198">pdf</a>, <a href="https://arxiv.org/format/2411.12198">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> CCIS-Diff: A Generative Model with Stable Diffusion Prior for Controlled Colonoscopy Image Synthesis </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xie%2C+Y">Yifan Xie</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+J">Jingge Wang</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+T">Tao Feng</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+F">Fei Ma</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yang Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.12198v1-abstract-short" style="display: inline;"> Colonoscopy is crucial for identifying adenomatous polyps and preventing colorectal cancer. However, developing robust models for polyp detection is challenging by the limited size and accessibility of existing colonoscopy datasets. While previous efforts have attempted to synthesize colonoscopy images, current methods suffer from instability and insufficient data diversity. 
   Abstract: Colonoscopy is crucial for identifying adenomatous polyps and preventing colorectal cancer. However, developing robust models for polyp detection is hampered by the limited size and accessibility of existing colonoscopy datasets. While previous efforts have attempted to synthesize colonoscopy images, current methods suffer from instability and insufficient data diversity. Moreover, these approaches lack precise control over the generation process, resulting in images that fail to meet clinical quality standards. To address these challenges, we propose CCIS-DIFF, a controlled generative model for high-quality colonoscopy image synthesis based on a diffusion architecture. Our method offers precise control over both the spatial attributes (polyp location and shape) and the clinical characteristics of polyps, so that they align with clinical descriptions. Specifically, we introduce a blur mask weighting strategy to seamlessly blend synthesized polyps with the colonic mucosa, and a text-aware attention mechanism to guide the generated images to reflect clinical characteristics. To support this, we construct a new multi-modal colonoscopy dataset that integrates images, mask annotations, and corresponding clinical text descriptions. Experimental results demonstrate that our method generates high-quality, diverse colonoscopy images with fine control over both spatial constraints and clinical consistency, offering valuable support for downstream segmentation and diagnostic tasks.
   Submitted 18 November, 2024; originally announced November 2024.
   Comments: 5 pages, 4 figures
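
   The blur mask weighting this abstract mentions can be pictured as alpha-blending a synthesized polyp into the mucosa under a Gaussian-blurred mask. A minimal sketch of that general idea follows; the function name, OpenCV usage, and kernel size are my assumptions, not the paper's code:

```python
# Hypothetical sketch of blur-mask weighted blending (not the authors' code):
# a binary polyp mask is softened with a Gaussian blur, then used as a
# per-pixel alpha weight so the synthesized polyp fades into the mucosa.
import cv2
import numpy as np

def blend_with_blur_mask(mucosa: np.ndarray, polyp: np.ndarray,
                         mask: np.ndarray, ksize: int = 31) -> np.ndarray:
    """mucosa, polyp: HxWx3 uint8 images; mask: HxW binary {0,1} polyp mask."""
    soft = cv2.GaussianBlur(mask.astype(np.float32), (ksize, ksize), 0)
    soft = soft[..., None]  # HxWx1, broadcasts over the color channels
    out = soft * polyp.astype(np.float32) + (1.0 - soft) * mucosa.astype(np.float32)
    return out.clip(0, 255).astype(np.uint8)
```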
</p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Feng%2C+T">Tiantian Feng</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+A">Anfeng Xu</a>, <a href="/search/cs?searchtype=author&query=Lahiri%2C+R">Rimita Lahiri</a>, <a href="/search/cs?searchtype=author&query=Tager-Flusberg%2C+H">Helen Tager-Flusberg</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+S+H">So Hyun Kim</a>, <a href="/search/cs?searchtype=author&query=Bishop%2C+S">Somer Bishop</a>, <a href="/search/cs?searchtype=author&query=Lord%2C+C">Catherine Lord</a>, <a href="/search/cs?searchtype=author&query=Narayanan%2C+S">Shrikanth Narayanan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.10761v1-abstract-short" style="display: inline;"> Large Language Models (LLMs) have shown significant potential in understanding human communication and interaction. However, their performance in the domain of child-inclusive interactions, including in clinical settings, remains less explored. In this work, we evaluate generic LLMs' ability to analyze child-adult dyadic interactions in a clinically relevant context involving children with ASD. Sp… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10761v1-abstract-full').style.display = 'inline'; document.getElementById('2411.10761v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.10761v1-abstract-full" style="display: none;"> Large Language Models (LLMs) have shown significant potential in understanding human communication and interaction. However, their performance in the domain of child-inclusive interactions, including in clinical settings, remains less explored. In this work, we evaluate generic LLMs' ability to analyze child-adult dyadic interactions in a clinically relevant context involving children with ASD. Specifically, we explore LLMs in performing four tasks: classifying child-adult utterances, predicting engaged activities, recognizing language skills and understanding traits that are clinically relevant. Our evaluation shows that generic LLMs are highly capable of analyzing long and complex conversations in clinical observation sessions, often surpassing the performance of non-expert human evaluators. The results show their potential to segment interactions of interest, assist in language skills evaluation, identify engaged activities, and offer clinical-relevant context for assessments. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10761v1-abstract-full').style.display = 'none'; document.getElementById('2411.10761v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">GenAI for Health Workshop, NeurIPS 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.16991">arXiv:2410.16991</a> <span> [<a href="https://arxiv.org/pdf/2410.16991">pdf</a>, <a href="https://arxiv.org/format/2410.16991">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1145/3680533.3697064">10.1145/3680533.3697064 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> An Eye for an AI: Evaluating GPT-4o's Visual Perception Skills and Geometric Reasoning Skills Using Computer Graphics Questions </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Feng%2C+T+H">Tony Haoran Feng</a>, <a href="/search/cs?searchtype=author&query=Denny%2C+P">Paul Denny</a>, <a href="/search/cs?searchtype=author&query=W%C3%BCnsche%2C+B+C">Burkhard C. W眉nsche</a>, <a href="/search/cs?searchtype=author&query=Luxton-Reilly%2C+A">Andrew Luxton-Reilly</a>, <a href="/search/cs?searchtype=author&query=Whalley%2C+J">Jacqueline Whalley</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.16991v1-abstract-short" style="display: inline;"> CG (Computer Graphics) is a popular field of CS (Computer Science), but many students find this topic difficult due to it requiring a large number of skills, such as mathematics, programming, geometric reasoning, and creativity. Over the past few years, researchers have investigated ways to harness the power of GenAI (Generative Artificial Intelligence) to improve teaching. In CS, much of the rese… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.16991v1-abstract-full').style.display = 'inline'; document.getElementById('2410.16991v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.16991v1-abstract-full" style="display: none;"> CG (Computer Graphics) is a popular field of CS (Computer Science), but many students find this topic difficult due to it requiring a large number of skills, such as mathematics, programming, geometric reasoning, and creativity. Over the past few years, researchers have investigated ways to harness the power of GenAI (Generative Artificial Intelligence) to improve teaching. In CS, much of the research has focused on introductory computing. A recent study evaluating the performance of an LLM (Large Language Model), GPT-4 (text-only), on CG questions, indicated poor performance and reliance on detailed descriptions of image content, which often required considerable insight from the user to return reasonable results. 
   Abstract: CG (Computer Graphics) is a popular field of CS (Computer Science), but many students find it difficult because it requires a wide range of skills, including mathematics, programming, geometric reasoning, and creativity. Over the past few years, researchers have investigated ways to harness the power of GenAI (Generative Artificial Intelligence) to improve teaching. In CS, much of this research has focused on introductory computing. A recent study evaluating the performance of an LLM (Large Language Model), GPT-4 (text-only), on CG questions indicated poor performance and reliance on detailed descriptions of image content, which often required considerable insight from the user to return reasonable results. So far, no studies have investigated the abilities of LMMs (Large Multimodal Models), or multimodal LLMs, to solve CG questions, or how these abilities can be used to improve teaching. In this study, we construct two datasets of CG questions requiring varying degrees of visual perception and geometric reasoning skills, and evaluate the current state-of-the-art LMM, GPT-4o, on both. We find that although GPT-4o exhibits great potential for solving questions with visual information independently, major limitations remain in the accuracy and quality of the generated results. We propose several novel approaches for CG educators to incorporate GenAI into CG teaching despite these limitations, and we hope our guidelines further encourage learning and engagement in CG classrooms.
   Submitted 22 October, 2024; originally announced October 2024.
   Comments: 8 pages, 8 figures, 1 table, to be published in SIGGRAPH Asia 2024 Educator's Forum
   ACM Class: I.2.7; I.3.0; K.3.2

4. arXiv:2410.14767 [pdf, other] — physics.geo-ph, cond-mat.soft, cs.LG
   Machine Learning Aided Modeling of Granular Materials: A Review
   Authors: Mengqi Wang, Krishna Kumar, Y. T. Feng, Tongming Qu, Min Wang
   Abstract: Artificial intelligence (AI) has become a buzzword since Google's AlphaGo beat a world champion in 2017. In the past five years, machine learning, as a subset of the broader category of AI, has received considerable attention in the granular materials research community. This work offers a detailed review of recent advances in machine learning-aided studies of granular materials, from particle-particle interaction at the grain level to macroscopic simulations of granular flow. It starts with the application of machine learning to microscopic particle-particle interaction and the associated contact models. Then, different neural networks for learning the constitutive behaviour of granular materials are reviewed and compared. Finally, macroscopic simulations of practical engineering or boundary value problems based on the combination of neural networks and numerical methods are discussed. We hope this comprehensive review gives readers a clear picture of the development of machine learning-aided modelling of granular materials.
   Submitted 18 October, 2024; originally announced October 2024.
   Comments: Submitted to Archives of Computational Methods in Engineering
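
   The review's middle theme, neural networks that learn constitutive behaviour, reduces to regressing a stress response from strain history. A minimal sketch of that generic shape; the architecture, inputs, and layer sizes are illustrative assumptions, not taken from any specific reviewed paper:

```python
# Illustrative sketch: a small network mapping a strain-increment history to
# a stress increment, the generic form of NN constitutive models compared in
# the review. All dimensions here are assumptions.
import torch
import torch.nn as nn

class ConstitutiveNet(nn.Module):
    def __init__(self, history_len: int = 10, n_components: int = 6):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(history_len * n_components, 64),
            nn.Tanh(),
            nn.Linear(64, 64),
            nn.Tanh(),
            nn.Linear(64, n_components),  # predicted stress increment
        )

    def forward(self, strain_history: torch.Tensor) -> torch.Tensor:
        # strain_history: (batch, history_len, n_components), flattened per sample
        return self.net(strain_history.flatten(start_dim=1))
```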
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Submitted to Archives of Computational Methods in Engineering</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.13185">arXiv:2410.13185</a> <span> [<a href="https://arxiv.org/pdf/2410.13185">pdf</a>, <a href="https://arxiv.org/format/2410.13185">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Chain of Ideas: Revolutionizing Research Via Novel Idea Development with LLM Agents </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+L">Long Li</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+W">Weiwen Xu</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+J">Jiayan Guo</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+R">Ruochen Zhao</a>, <a href="/search/cs?searchtype=author&query=Li%2C+X">Xingxuan Li</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+Y">Yuqian Yuan</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+B">Boqiang Zhang</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+Y">Yuming Jiang</a>, <a href="/search/cs?searchtype=author&query=Xin%2C+Y">Yifei Xin</a>, <a href="/search/cs?searchtype=author&query=Dang%2C+R">Ronghao Dang</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+D">Deli Zhao</a>, <a href="/search/cs?searchtype=author&query=Rong%2C+Y">Yu Rong</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+T">Tian Feng</a>, <a href="/search/cs?searchtype=author&query=Bing%2C+L">Lidong Bing</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.13185v5-abstract-short" style="display: inline;"> Effective research ideation is a critical step for scientific research. However, the exponential increase in scientific literature makes it challenging for researchers to stay current with recent advances and identify meaningful research directions. Recent developments in large language models~(LLMs) suggest a promising avenue for automating the generation of novel research ideas. However, existin… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.13185v5-abstract-full').style.display = 'inline'; document.getElementById('2410.13185v5-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.13185v5-abstract-full" style="display: none;"> Effective research ideation is a critical step for scientific research. However, the exponential increase in scientific literature makes it challenging for researchers to stay current with recent advances and identify meaningful research directions. Recent developments in large language models~(LLMs) suggest a promising avenue for automating the generation of novel research ideas. However, existing methods for idea generation either trivially prompt LLMs or directly expose LLMs to extensive literature without indicating useful information. 
   Inspired by the research process of human researchers, we propose a Chain-of-Ideas (CoI) agent, an LLM-based agent that organizes relevant literature in a chain structure to effectively mirror the progressive development of a research domain. This organization helps LLMs capture current research advancements, thereby enhancing their ideation capabilities. Furthermore, we propose Idea Arena, an evaluation protocol that comprehensively evaluates idea-generation methods from different perspectives, aligning closely with the preferences of human researchers. Experimental results indicate that the CoI agent consistently outperforms other methods and generates research ideas of quality comparable to those of humans. Moreover, the CoI agent is budget-friendly, with a minimum cost of $0.50 to generate a candidate idea and its corresponding experimental design.
   Submitted 30 October, 2024; v1 submitted 16 October, 2024; originally announced October 2024.
   Comments: 10 pages, 5 figures, conference
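
   The chain organization described here can be read as: order related papers chronologically and present them to the LLM as a progression. A minimal sketch of that prompt construction; the data layout and prompt wording are my assumptions, not the CoI implementation:

```python
# Hypothetical sketch of a chain-style prompt in the spirit of the abstract:
# papers ordered oldest-to-newest so the LLM sees the progressive development
# of a topic. Field names and wording are assumptions.
from dataclasses import dataclass

@dataclass
class Paper:
    year: int
    title: str
    summary: str

def chain_of_ideas_prompt(papers: list[Paper], topic: str) -> str:
    chain = sorted(papers, key=lambda p: p.year)  # oldest first
    steps = "\n".join(
        f"[{i}] ({p.year}) {p.title}: {p.summary}"
        for i, p in enumerate(chain, 1)
    )
    return (
        f"The following papers on '{topic}' are ordered oldest to newest,\n"
        f"showing how the idea developed:\n{steps}\n"
        "Propose a novel next research idea that extends this progression."
    )
```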

6. arXiv:2410.11001 [pdf, other] — cs.CL, cs.AI, cs.LG
   Graph of Records: Boosting Retrieval Augmented Generation for Long-context Summarization with Graphs
   Authors: Haozhen Zhang, Tao Feng, Jiaxuan You
   Abstract: Retrieval-augmented generation (RAG) has revitalized Large Language Models (LLMs) by injecting non-parametric factual knowledge. Compared with long-context LLMs, RAG is considered a more concise and lightweight summarization tool, one that can interact with LLMs multiple times using diverse queries to obtain comprehensive responses. However, the LLM-generated historical responses, which contain potentially insightful information, are largely neglected and discarded by existing approaches, leading to suboptimal results. In this paper, we propose graph of records (GoR), which leverages historical responses generated by LLMs to enhance RAG for long-context global summarization. Inspired by the retrieve-then-generate paradigm of RAG, we construct a graph by establishing an edge between the retrieved text chunks and the corresponding LLM-generated response. To further uncover the intricate correlations between them, GoR features a graph neural network and an elaborately designed BERTScore-based objective for self-supervised model training, enabling seamless supervision-signal backpropagation between reference summaries and node embeddings. We comprehensively compare GoR with 12 baselines across four long-context summarization datasets, and the results indicate that our proposed method reaches the best performance (e.g., 15%, 8%, and 19% improvement over retrievers w.r.t. Rouge-L, Rouge-1, and Rouge-2 on the WCEP dataset). Extensive experiments further demonstrate the effectiveness of GoR. Code is available at https://github.com/ulab-uiuc/GoR
   Submitted 14 October, 2024; originally announced October 2024.
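
   The retrieve-then-generate edges the abstract describes can be pictured as a bipartite graph between retrieved chunks and the responses they produced. A minimal sketch; the networkx usage and record layout are my assumptions (the actual GoR code lives at the linked repository):

```python
# Hypothetical sketch of the graph construction described in the abstract:
# each LLM response becomes a node linked to the text chunks retrieved for
# it. Field names are assumptions, not the GoR code.
import networkx as nx

def build_graph_of_records(records: list[dict]) -> nx.Graph:
    """records: [{'query': str, 'chunks': [str], 'response': str}, ...]"""
    g = nx.Graph()
    for i, rec in enumerate(records):
        resp_node = f"response:{i}"
        g.add_node(resp_node, kind="response", text=rec["response"])
        for chunk in rec["chunks"]:
            g.add_node(chunk, kind="chunk", text=chunk)
            g.add_edge(chunk, resp_node)  # retrieved-chunk <-> response edge
    return g
```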
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.08114">arXiv:2410.08114</a> <span> [<a href="https://arxiv.org/pdf/2410.08114">pdf</a>, <a href="https://arxiv.org/format/2410.08114">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Parameter-Efficient Fine-Tuning in Spectral Domain for Point Cloud Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liang%2C+D">Dingkang Liang</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+T">Tianrui Feng</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+X">Xin Zhou</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yumeng Zhang</a>, <a href="/search/cs?searchtype=author&query=Zou%2C+Z">Zhikang Zou</a>, <a href="/search/cs?searchtype=author&query=Bai%2C+X">Xiang Bai</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.08114v1-abstract-short" style="display: inline;"> Recently, leveraging pre-training techniques to enhance point cloud models has become a hot research topic. However, existing approaches typically require full fine-tuning of pre-trained models to achieve satisfied performance on downstream tasks, accompanying storage-intensive and computationally demanding. To address this issue, we propose a novel Parameter-Efficient Fine-Tuning (PEFT) method fo… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.08114v1-abstract-full').style.display = 'inline'; document.getElementById('2410.08114v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.08114v1-abstract-full" style="display: none;"> Recently, leveraging pre-training techniques to enhance point cloud models has become a hot research topic. However, existing approaches typically require full fine-tuning of pre-trained models to achieve satisfied performance on downstream tasks, accompanying storage-intensive and computationally demanding. To address this issue, we propose a novel Parameter-Efficient Fine-Tuning (PEFT) method for point cloud, called PointGST (Point cloud Graph Spectral Tuning). PointGST freezes the pre-trained model and introduces a lightweight, trainable Point Cloud Spectral Adapter (PCSA) to fine-tune parameters in the spectral domain. The core idea is built on two observations: 1) The inner tokens from frozen models might present confusion in the spatial domain; 2) Task-specific intrinsic information is important for transferring the general knowledge to the downstream task. Specifically, PointGST transfers the point tokens from the spatial domain to the spectral domain, effectively de-correlating confusion among tokens via using orthogonal components for separating. Moreover, the generated spectral basis involves intrinsic information about the downstream point clouds, enabling more targeted tuning. As a result, PointGST facilitates the efficient transfer of general knowledge to downstream tasks while significantly reducing training costs. 
   Extensive experiments on challenging point cloud datasets across various tasks demonstrate that PointGST not only outperforms its fully fine-tuned counterpart but also significantly reduces trainable parameters, making it a promising solution for efficient point cloud learning. It improves upon a solid baseline by +2.28%, +1.16%, and +2.78%, reaching 99.48%, 97.76%, and 96.18% accuracy on the ScanObjectNN OBJ_BG, OBJ_ONLY, and PB_T50_RS datasets, respectively, establishing a new state of the art with only 0.67% of the trainable parameters.
   Submitted 10 October, 2024; originally announced October 2024.
   Comments: The code will be made available at https://github.com/jerryfeng2003/PointGST
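
   The spectral move the abstract sketches, projecting tokens into a graph-spectral basis before a small trainable adapter, can be illustrated with a graph-Laplacian eigenbasis. The following is a sketch of the general technique, not the PointGST code; the graph construction, adapter shape, and rank are assumptions:

```python
# Illustrative graph-spectral adapter: project point tokens onto a
# graph-Laplacian eigenbasis (spectral domain), apply a small trainable
# bottleneck, and project back. All specifics are assumptions.
import torch
import torch.nn as nn

class SpectralAdapter(nn.Module):
    def __init__(self, dim: int, rank: int = 8):
        super().__init__()
        self.down = nn.Linear(dim, rank)
        self.up = nn.Linear(rank, dim)

    def forward(self, tokens: torch.Tensor, adj: torch.Tensor) -> torch.Tensor:
        # tokens: (n, dim); adj: (n, n) symmetric point-graph adjacency
        deg = torch.diag(adj.sum(dim=1))
        lap = deg - adj                      # combinatorial graph Laplacian
        _, basis = torch.linalg.eigh(lap)    # orthogonal spectral basis
        spec = basis.T @ tokens              # spatial -> spectral domain
        spec = spec + self.up(torch.relu(self.down(spec)))  # tune in spectrum
        return basis @ spec                  # spectral -> spatial domain
```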

8. arXiv:2410.03834 [pdf, other] — cs.AI
   GraphRouter: A Graph-based Router for LLM Selections
   Authors: Tao Feng, Yanzhen Shen, Jiaxuan You
   Abstract: The rapidly growing number and variety of Large Language Models (LLMs) present significant challenges in efficiently selecting the appropriate LLM for a given query, especially considering the trade-offs between performance and computational cost. Current LLM selection methods often struggle to generalize across new LLMs and different tasks because of their limited ability to leverage contextual interactions among tasks, queries, and LLMs, as well as their dependence on a transductive learning framework. To address these shortcomings, we introduce a novel inductive graph framework, GraphRouter, which fully utilizes the contextual information among tasks, queries, and LLMs to enhance the LLM selection process. GraphRouter constructs a heterogeneous graph comprising task, query, and LLM nodes, with interactions represented as edges, which efficiently captures the contextual information between a query's requirements and an LLM's capabilities. Through an innovative edge-prediction mechanism, GraphRouter predicts the attributes (effect and cost of the LLM response) of potential edges, allowing for optimized recommendations that adapt to both existing and newly introduced LLMs without retraining. Comprehensive experiments across three distinct effect-cost weight scenarios show that GraphRouter substantially surpasses existing routers, delivering a minimum performance improvement of 12.3%. In addition, it achieves enhanced generalization across new-LLM settings and supports diverse tasks with at least a 9.5% boost in effect and a significant reduction in computational demands. This work applies a graph-based approach to the contextual and adaptive selection of LLMs, offering insights for real-world applications. Our code for GraphRouter will soon be released at https://github.com/ulab-uiuc/GraphRouter.
   Submitted 4 October, 2024; originally announced October 2024.
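
   The heterogeneous routing graph described above has a simple shape: task, query, and LLM nodes, with observed query-LLM edges carrying effect and cost, and unseen edges scored at routing time. A minimal sketch; the networkx usage and the `predict_edge` stub (standing in for the learned edge-prediction model) are my assumptions:

```python
# Hypothetical sketch of the routing graph from the abstract. The scoring
# function is a placeholder for the learned edge predictor.
import networkx as nx

def build_router_graph(tasks, queries, llms, observations):
    """observations: [(query_id, llm_id, effect, cost), ...]"""
    g = nx.Graph()
    g.add_nodes_from(tasks, kind="task")
    g.add_nodes_from(queries, kind="query")
    g.add_nodes_from(llms, kind="llm")
    for q, m, effect, cost in observations:
        g.add_edge(q, m, effect=effect, cost=cost)  # observed interaction
    return g

def route(g, query_id, llms, predict_edge, weight=0.5):
    """predict_edge(g, q, m) -> (effect, cost) stands in for the learned
    edge-prediction model; pick the LLM with the best effect-cost trade-off."""
    best, best_score = None, float("-inf")
    for m in llms:
        effect, cost = predict_edge(g, query_id, m)
        score = weight * effect - (1.0 - weight) * cost
        if score > best_score:
            best, best_score = m, score
    return best
```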
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.15525">arXiv:2409.15525</a> <span> [<a href="https://arxiv.org/pdf/2409.15525">pdf</a>, <a href="https://arxiv.org/format/2409.15525">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Speech2rtMRI: Speech-Guided Diffusion Model for Real-time MRI Video of the Vocal Tract during Speech </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Nguyen%2C+H">Hong Nguyen</a>, <a href="/search/cs?searchtype=author&query=Foley%2C+S">Sean Foley</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+K">Kevin Huang</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+X">Xuan Shi</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+T">Tiantian Feng</a>, <a href="/search/cs?searchtype=author&query=Narayanan%2C+S">Shrikanth Narayanan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.15525v1-abstract-short" style="display: inline;"> Understanding speech production both visually and kinematically can inform second language learning system designs, as well as the creation of speaking characters in video games and animations. In this work, we introduce a data-driven method to visually represent articulator motion in Magnetic Resonance Imaging (MRI) videos of the human vocal tract during speech based on arbitrary audio or speech… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.15525v1-abstract-full').style.display = 'inline'; document.getElementById('2409.15525v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.15525v1-abstract-full" style="display: none;"> Understanding speech production both visually and kinematically can inform second language learning system designs, as well as the creation of speaking characters in video games and animations. In this work, we introduce a data-driven method to visually represent articulator motion in Magnetic Resonance Imaging (MRI) videos of the human vocal tract during speech based on arbitrary audio or speech input. We leverage large pre-trained speech models, which are embedded with prior knowledge, to generalize the visual domain to unseen data using a speech-to-video diffusion model. Our findings demonstrate that the visual generation significantly benefits from the pre-trained speech representations. We also observed that evaluating phonemes in isolation is challenging but becomes more straightforward when assessed within the context of spoken words. Limitations of the current results include the presence of unsmooth tongue motion and video distortion when the tongue contacts the palate. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.15525v1-abstract-full').style.display = 'none'; document.getElementById('2409.15525v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">4 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.13606">arXiv:2409.13606</a> <span> [<a href="https://arxiv.org/pdf/2409.13606">pdf</a>, <a href="https://arxiv.org/format/2409.13606">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Towards Child-Inclusive Clinical Video Understanding for Autism Spectrum Disorder </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Kommineni%2C+A">Aditya Kommineni</a>, <a href="/search/cs?searchtype=author&query=Bose%2C+D">Digbalay Bose</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+T">Tiantian Feng</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+S+H">So Hyun Kim</a>, <a href="/search/cs?searchtype=author&query=Tager-Flusberg%2C+H">Helen Tager-Flusberg</a>, <a href="/search/cs?searchtype=author&query=Bishop%2C+S">Somer Bishop</a>, <a href="/search/cs?searchtype=author&query=Lord%2C+C">Catherine Lord</a>, <a href="/search/cs?searchtype=author&query=Kadiri%2C+S">Sudarsana Kadiri</a>, <a href="/search/cs?searchtype=author&query=Narayanan%2C+S">Shrikanth Narayanan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.13606v1-abstract-short" style="display: inline;"> Clinical videos in the context of Autism Spectrum Disorder are often long-form interactions between children and caregivers/clinical professionals, encompassing complex verbal and non-verbal behaviors. Objective analyses of these videos could provide clinicians and researchers with nuanced insights into the behavior of children with Autism Spectrum Disorder. Manually coding these videos is a time-… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.13606v1-abstract-full').style.display = 'inline'; document.getElementById('2409.13606v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.13606v1-abstract-full" style="display: none;"> Clinical videos in the context of Autism Spectrum Disorder are often long-form interactions between children and caregivers/clinical professionals, encompassing complex verbal and non-verbal behaviors. Objective analyses of these videos could provide clinicians and researchers with nuanced insights into the behavior of children with Autism Spectrum Disorder. Manually coding these videos is a time-consuming task and requires a high level of domain expertise. 
   Hence, the ability to capture these interactions computationally can augment the manual effort and support the diagnostic procedure. In this work, we investigate the use of foundation models across three modalities: speech, video, and text, to analyze child-focused interaction sessions. We propose a unified methodology that combines multiple modalities by using large language models as reasoning agents. We evaluate their performance on two tasks with different information granularity: activity recognition and abnormal behavior detection. We find that the proposed multimodal pipeline provides robustness to modality-specific limitations and improves performance on clinical video analysis compared to unimodal settings.
   Submitted 20 September, 2024; originally announced September 2024.
   Comments: 5 pages, 2 figures, 2 tables
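
   The "LLM as reasoning agent" fusion described here amounts to serializing per-modality model outputs into one prompt for a final decision. A minimal sketch; the prompt wording and argument names are my assumptions, not the paper's pipeline:

```python
# Hypothetical sketch of late fusion with an LLM as the reasoning agent:
# per-modality outputs are serialized into one prompt and the LLM produces
# the final label. Structure and wording are assumptions.
def fuse_modalities(speech_out: str, video_out: str, text_out: str,
                    task: str = "activity recognition") -> str:
    return (
        f"You are analyzing a clinical interaction session for {task}.\n"
        f"Speech model output: {speech_out}\n"
        f"Video model output: {video_out}\n"
        f"Transcript model output: {text_out}\n"
        "Reason over the three modalities, note any disagreement, and "
        "output the single most likely label."
    )
```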

11. arXiv:2409.10176 [pdf, other] — cs.LG, stat.AP
   TCDformer-based Momentum Transfer Model for Long-term Sports Prediction
   Authors: Hui Liu, Jiacheng Gu, Xiyuan Huang, Junjie Shi, Tongtong Feng, Ning He
   Abstract: Accurate sports prediction is a crucial skill for professional coaches, as it can assist in developing effective training strategies and scientific competition tactics. Traditional methods often use complex mathematical-statistical techniques to boost predictability, but they are often limited by dataset scale and struggle with long-term predictions under variable distributions, notably underperforming when predicting point-set-game multi-level matches. To address this challenge, this paper proposes TM2, a TCDformer-based Momentum Transfer Model for long-term sports prediction, which comprises a momentum encoding module and a momentum-transfer-based prediction module. TM2 first encodes momentum in large-scale unstructured time series using a local linear scaling approximation (LLSA) module. It then decomposes the reconstructed time series with momentum transfer into trend and seasonal components. The final predictions are the additive combination of a multilayer perceptron (MLP) for the trend components and wavelet attention mechanisms for the seasonal components. Comprehensive experimental results show that on the 2023 Wimbledon men's tournament dataset, TM2 significantly surpasses existing sports prediction models, reducing MSE by 61.64% and MAE by 63.64%.
   Submitted 16 September, 2024; originally announced September 2024.
   Comments: Under review
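
   The additive trend/seasonal design the abstract describes, separate heads whose outputs are summed, can be sketched with a simple moving-average decomposition. This is a sketch of the general shape only; the decomposition choice, layer types (a linear head standing in for wavelet attention), and sizes are my assumptions:

```python
# Illustrative trend/seasonal split with separate prediction heads; the
# moving-average decomposition and head choices are assumptions, not TM2.
import torch
import torch.nn as nn

class DecomposedForecaster(nn.Module):
    def __init__(self, window: int, horizon: int, kernel: int = 25):
        super().__init__()
        self.pool = nn.AvgPool1d(kernel, stride=1, padding=kernel // 2)
        self.trend_head = nn.Linear(window, horizon)   # MLP for trend part
        self.season_head = nn.Linear(window, horizon)  # stand-in for the
                                                       # wavelet attention
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # x: (batch, window) univariate series
        trend = self.pool(x.unsqueeze(1)).squeeze(1)[:, : x.size(1)]
        seasonal = x - trend
        return self.trend_head(trend) + self.season_head(seasonal)
```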
<p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Under review</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.09572">arXiv:2409.09572</a> <span> [<a href="https://arxiv.org/pdf/2409.09572">pdf</a>, <a href="https://arxiv.org/format/2409.09572">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> A Novel Aerial-Aquatic Locomotion Robot with Variable Stiffness Propulsion Module </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Hu%2C+J">Junzhe Hu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+P">Pengyu Chen</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+T">Tianxiang Feng</a>, <a href="/search/cs?searchtype=author&query=Wen%2C+Y">Yuxuan Wen</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+K">Ke Wu</a>, <a href="/search/cs?searchtype=author&query=Dong%2C+J">Janet Dong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2409.09572v1-abstract-full"> In recent years, the development of robots capable of operating in both aerial and aquatic environments has gained significant attention. This study presents the design and fabrication of a novel aerial-aquatic locomotion robot (AALR). Inspired by the diving beetle, the AALR incorporates a biomimetic propulsion mechanism with power and recovery strokes. The variable stiffness propulsion module (VSPM) uses low melting point alloy (LMPA) and variable stiffness joints (VSJ) to achieve efficient aquatic locomotion while reducing harm to marine life. The AALR's innovative design integrates the VSPM into the arms of a traditional quadrotor, allowing for effective aerial-aquatic locomotion. The VSPM adjusts joint stiffness through temperature control, meeting locomotion requirements in both aerial and aquatic modes. A dynamic model for the VSPM was developed, with optimized dimensional parameters to increase propulsion force. Experiments focused on aquatic mode analysis and demonstrated the AALR's swimming capability, achieving a maximum swimming speed of 77 mm/s underwater. The results confirm the AALR's effective performance in the water environment, highlighting its potential for versatile, eco-friendly operations.
</span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">8 pages, 10 figures, ICRA</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.09340">arXiv:2409.09340</a> <span> [<a href="https://arxiv.org/pdf/2409.09340">pdf</a>, <a href="https://arxiv.org/format/2409.09340">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Egocentric Speaker Classification in Child-Adult Dyadic Interactions: From Sensing to Computational Modeling </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Feng%2C+T">Tiantian Feng</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+A">Anfeng Xu</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+X">Xuan Shi</a>, <a href="/search/cs?searchtype=author&query=Bishop%2C+S">Somer Bishop</a>, <a href="/search/cs?searchtype=author&query=Narayanan%2C+S">Shrikanth Narayanan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2409.09340v1-abstract-full"> Autism spectrum disorder (ASD) is a neurodevelopmental condition characterized by challenges in social communication, repetitive behavior, and sensory processing. One important research area in ASD is evaluating children's behavioral changes over time during treatment. The standard protocol for this objective is BOSCC, which involves dyadic interactions between a child and clinicians performing a pre-defined set of activities. A fundamental aspect of understanding children's behavior in these interactions is automatic speech understanding, particularly identifying who speaks and when.
Conventional approaches in this area heavily rely on speech samples recorded from a spectator perspective, and there is limited research on egocentric speech modeling. In this study, we design an experiment to perform speech sampling in BOSCC interviews from an egocentric perspective using wearable sensors and explore pre-training on Ego4D speech samples to enhance child-adult speaker classification in dyadic interactions. Our findings highlight the potential of egocentric speech collection and pre-training to improve speaker classification accuracy. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">pre-print under review</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.04593">arXiv:2409.04593</a> <span> [<a href="https://arxiv.org/pdf/2409.04593">pdf</a>, <a href="https://arxiv.org/format/2409.04593">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Paper Copilot: A Self-Evolving and Efficient LLM System for Personalized Academic Assistance </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lin%2C+G">Guanyu Lin</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+T">Tao Feng</a>, <a href="/search/cs?searchtype=author&query=Han%2C+P">Pengrui Han</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+G">Ge Liu</a>, <a href="/search/cs?searchtype=author&query=You%2C+J">Jiaxuan You</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2409.04593v1-abstract-full"> As scientific research proliferates, researchers face the daunting task of navigating and reading vast amounts of literature. Existing solutions, such as document QA, fail to provide personalized and up-to-date information efficiently.
We present Paper Copilot, a self-evolving, efficient LLM system designed to assist researchers, based on thought-retrieval, user profiles, and high-performance optimization. Specifically, Paper Copilot can offer personalized research services, maintaining a real-time updated database. Quantitative evaluation demonstrates that Paper Copilot saves 69.92% of time after efficient deployment. This paper details the design and implementation of Paper Copilot, highlighting its contributions to personalized academic support and its potential to streamline the research process. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.04473">arXiv:2409.04473</a> <span> [<a href="https://arxiv.org/pdf/2409.04473">pdf</a>, <a href="https://arxiv.org/format/2409.04473">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Learning in Order! A Sequential Strategy to Learn Invariant Features for Multimodal Sentiment Analysis </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhao%2C+X">Xianbing Zhao</a>, <a href="/search/cs?searchtype=author&query=Qu%2C+L">Lizhen Qu</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+T">Tao Feng</a>, <a href="/search/cs?searchtype=author&query=Cai%2C+J">Jianfei Cai</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+B">Buzhou Tang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2409.04473v1-abstract-full"> This work proposes a novel and simple sequential learning strategy to train models on videos and texts for multimodal sentiment analysis. To estimate sentiment polarities on unseen out-of-distribution data, we introduce a multimodal model that is trained either in a single source domain or multiple source domains using our learning strategy.
This strategy starts with learning domain invariant features from text, followed by learning sparse domain-agnostic features from videos, assisted by the selected features learned in text. Our experimental results demonstrate that our model achieves significantly better performance than the state-of-the-art approaches on average in both single-source and multi-source settings. Our feature selection procedure favors the features that are independent of each other and are strongly correlated with their polarity labels. To facilitate research on this topic, the source code of this work will be publicly available upon acceptance. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.17031">arXiv:2408.17031</a> <span> [<a href="https://arxiv.org/pdf/2408.17031">pdf</a>, <a href="https://arxiv.org/format/2408.17031">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> </div> </div> <p class="title is-5 mathjax"> Meta-UAD: A Meta-Learning Scheme for User-level Network Traffic Anomaly Detection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Feng%2C+T">Tongtong Feng</a>, <a href="/search/cs?searchtype=author&query=Qi%2C+Q">Qi Qi</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+L">Lingqi Guo</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+J">Jingyu Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2408.17031v1-abstract-full"> Accurate anomaly detection in user-level network traffic is crucial for network security. Compared with existing models that passively detect specific anomaly classes with large labeled training samples, user-level network traffic contains sizeable new anomaly classes with few labeled samples and has an imbalanced, self-similar, and data-hungry nature. Motivated by those limitations, in this paper we propose <em>Meta-UAD</em>, a Meta-learning scheme for User-level network traffic Anomaly Detection.
Meta-UAD uses the CICFlowMeter to extract 81 flow-level statistical features and removes invalid ones using cumulative importance ranking. Meta-UAD adopts a meta-learning training structure and learns from a collection of K-way-M-shot classification tasks, so a pre-trained model can adapt to any new class with few samples in a few iteration steps. We evaluate our scheme on two public datasets. Compared with existing models, the results further demonstrate the superiority of Meta-UAD with 15%-43% gains in F1-score. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Under review. arXiv admin note: substantial text overlap with arXiv:2408.14884</span> </p> </li>
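<p class="is-size-7">The K-way-M-shot episodic regime named in the Meta-UAD abstract can be sketched as below. This is a minimal illustration of episodic task sampling in general, not the paper's code; the data layout (a dict mapping class labels to per-flow feature vectors) is an assumption.</p>
<pre><code># Minimal sketch of K-way M-shot episodic sampling, the training regime the
# abstract describes. The data layout is an illustrative assumption.
import random
from typing import Dict, List, Tuple

def sample_episode(data: Dict[str, List[list]], k_way: int, m_shot: int,
                   m_query: int) -> Tuple[list, list]:
    """Draw one K-way M-shot task: a labeled support set and query set."""
    classes = random.sample(sorted(data), k_way)
    support, query = [], []
    for label, cls in enumerate(classes):
        samples = random.sample(data[cls], m_shot + m_query)
        support += [(x, label) for x in samples[:m_shot]]
        query += [(x, label) for x in samples[m_shot:]]
    return support, query

# A meta-learner is trained on a stream of such tasks so that, at deployment,
# a new anomaly class can be adapted to from only M labeled flows.
</code></pre>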
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.17028">arXiv:2408.17028</a> <span> [<a href="https://arxiv.org/pdf/2408.17028">pdf</a>, <a href="https://arxiv.org/format/2408.17028">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Networking and Internet Architecture">cs.NI</span> </div> </div> <p class="title is-5 mathjax"> Deadline and Priority Constrained Immersive Video Streaming Transmission Scheduling </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Feng%2C+T">Tongtong Feng</a>, <a href="/search/cs?searchtype=author&query=Qi%2C+Q">Qi Qi</a>, <a href="/search/cs?searchtype=author&query=He%2C+B">Bo He</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+J">Jingyu Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2408.17028v1-abstract-full"> Deadline-aware transmission scheduling in immersive video streaming is crucial. The objective is to guarantee that at least a certain share of blocks across multiple links is fully delivered within their deadlines, which is referred to as the delivery ratio. Compared with existing models that focus on maximizing throughput and ultra-low latency, which leaves bandwidth resource allocation and user satisfaction only locally optimized, immersive video streaming needs to guarantee more high-priority block delivery within personalized deadlines. In this paper, we propose a deadline- and priority-constrained immersive video streaming transmission scheduling scheme. It builds an accurate bandwidth prediction model that can sensitively assist scheduling decisions. It divides video streaming into various media elements and performs scheduling based on the user's personalized latency sensitivity thresholds and each media element's priority. We evaluate our scheme via trace-driven simulations. Compared with existing models, the results further demonstrate the superiority of our scheme with 12%-31% gains in quality of experience (QoE). </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Under review</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.15971">arXiv:2408.15971</a> <span> [<a href="https://arxiv.org/pdf/2408.15971">pdf</a>, <a href="https://arxiv.org/format/2408.15971">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> BattleAgentBench: A Benchmark for Evaluating Cooperation and Competition Capabilities of Language Models in Multi-Agent Systems </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+W">Wei Wang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+D">Dan Zhang</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+T">Tao Feng</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+B">Boyan Wang</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+J">Jie Tang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>:
<span class="abstract-full has-text-grey-dark mathjax" id="2408.15971v1-abstract-full"> Large Language Models (LLMs) are becoming increasingly powerful and capable of handling complex tasks, e.g., building single agents and multi-agent systems. Compared to single agents, multi-agent systems have higher requirements for the collaboration capabilities of language models. Many benchmarks have been proposed to evaluate their collaborative abilities. However, these benchmarks lack fine-grained evaluations of LLM collaborative capabilities. Additionally, multi-agent collaborative and competitive scenarios are ignored in existing works. To address these two problems, we propose a benchmark, called BattleAgentBench, which defines seven sub-stages of three varying difficulty levels and conducts a fine-grained evaluation of language models in terms of single-agent scenario navigation capabilities, paired-agent task execution abilities, and multi-agent collaboration and competition capabilities. We conducted extensive evaluations on four leading closed-source models and seven open-source models. Experimental results indicate that API-based models perform excellently on simple tasks, while small open-source models struggle with them. Regarding difficult tasks that require collaborative and competitive abilities, although API-based models have demonstrated some collaborative capabilities, there is still enormous room for improvement. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.15803">arXiv:2408.15803</a> <span> [<a href="https://arxiv.org/pdf/2408.15803">pdf</a>, <a href="https://arxiv.org/format/2408.15803">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> ModalityMirror: Improving Audio Classification in Modality Heterogeneity Federated Learning with Multimodal Distillation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Feng%2C+T">Tiantian Feng</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+T">Tuo Zhang</a>, <a href="/search/cs?searchtype=author&query=Avestimehr%2C+S">Salman Avestimehr</a>, <a href="/search/cs?searchtype=author&query=Narayanan%2C+S+S">Shrikanth S. Narayanan</a>
</p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2408.15803v1-abstract-full"> Multimodal Federated Learning frequently encounters challenges of client modality heterogeneity, leading to undesired performance for the secondary modality in multimodal learning. It is particularly prevalent in audiovisual learning, with audio often assumed to be the weaker modality in recognition tasks. To address this challenge, we introduce ModalityMirror to improve audio model performance by leveraging knowledge distillation from an audiovisual federated learning model. ModalityMirror involves two phases: a modality-wise FL stage to aggregate uni-modal encoders, and a federated knowledge distillation stage on multi-modality clients to train a unimodal student model. Our results demonstrate that ModalityMirror significantly improves audio classification compared to state-of-the-art FL methods such as Harmony, particularly in audiovisual FL with missing video. Our approach unlocks the potential for exploiting the diverse modality spectrum inherent in multi-modal FL. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p>
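<p class="is-size-7">A distillation stage like the one this abstract describes typically optimizes the standard softened-logit objective sketched below: a unimodal audio student matches an audiovisual teacher. This is the textbook knowledge-distillation loss, not claimed to be ModalityMirror's exact objective; the temperature and weighting are illustrative assumptions.</p>
<pre><code># Standard knowledge-distillation objective (Hinton-style), as a sketch of
# the federated distillation stage described above. Hyperparameters assumed.
import torch
import torch.nn.functional as F

def distillation_loss(student_logits: torch.Tensor,
                      teacher_logits: torch.Tensor,
                      labels: torch.Tensor,
                      temperature: float = 2.0,
                      alpha: float = 0.5) -> torch.Tensor:
    # Soft term: KL between temperature-softened student and teacher logits.
    soft = F.kl_div(
        F.log_softmax(student_logits / temperature, dim=-1),
        F.softmax(teacher_logits / temperature, dim=-1),
        reduction="batchmean",
    ) * temperature ** 2  # rescale gradients for the softened targets
    # Hard term: ordinary cross-entropy against the ground-truth labels.
    hard = F.cross_entropy(student_logits, labels)
    return alpha * soft + (1 - alpha) * hard
</code></pre>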
</li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.14884">arXiv:2408.14884</a> <span> [<a href="https://arxiv.org/pdf/2408.14884">pdf</a>, <a href="https://arxiv.org/format/2408.14884">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> </div> </div> <p class="title is-5 mathjax"> Multimedia Traffic Anomaly Detection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Feng%2C+T">Tongtong Feng</a>, <a href="/search/cs?searchtype=author&query=Qi%2C+Q">Qi Qi</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+J">Jingyu Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2408.14884v3-abstract-full"> Accurate anomaly detection in user-level social multimedia traffic is crucial for privacy security. Compared with existing models that passively detect specific anomaly classes with large labeled training samples, user-level social multimedia traffic contains sizeable new anomaly classes with few labeled samples and has an imbalanced, self-similar, and data-hungry nature. Recent advances, such as Generative Adversarial Networks (GAN), address this by learning a sample generator only from seen class samples to synthesize new samples. However, when many new classes must be detected, the number of samples to synthesize becomes infeasible to estimate, and this operation drastically increases computational complexity and energy consumption. Motivated by these limitations, in this paper we propose <em>Meta-UAD</em>, a Meta-learning scheme for User-level social multimedia traffic Anomaly Detection. This scheme relies on the episodic training paradigm and learns from a collection of K-way-M-shot classification tasks, so the pre-trained model can adapt to any new class with few samples by going through a few iteration steps. Since user-level social multimedia traffic emerges from a complex interaction process between users and social applications, we further develop a feature extractor to improve scheme performance. It extracts statistical features using cumulative importance ranking and time-series features using an LSTM-based AutoEncoder. We evaluate our scheme on two public datasets and the results further demonstrate the superiority of Meta-UAD. </span> </p>
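<p class="is-size-7">The LSTM-based AutoEncoder named in this abstract can be sketched as follows: the encoder's final hidden state serves as the per-flow time-series feature. This is an illustrative reconstruction; layer sizes and the repeat-and-decode scheme are assumptions, not the paper's architecture.</p>
<pre><code># Sketch of an LSTM autoencoder for time-series features of the kind the
# abstract's feature extractor describes. Dimensions are assumptions.
import torch
import torch.nn as nn

class LSTMAutoEncoder(nn.Module):
    def __init__(self, n_features: int, embed_dim: int = 32):
        super().__init__()
        self.encoder = nn.LSTM(n_features, embed_dim, batch_first=True)
        self.decoder = nn.LSTM(embed_dim, n_features, batch_first=True)

    def forward(self, x: torch.Tensor) -> tuple:
        # x: (batch, time, n_features) per-flow packet statistics over time.
        _, (h, _) = self.encoder(x)        # h: (1, batch, embed_dim)
        z = h[-1]                          # flow embedding
        # Repeat the embedding across time and decode a reconstruction; the
        # reconstruction loss trains z to summarize the whole sequence.
        rep = z.unsqueeze(1).expand(-1, x.size(1), -1)
        recon, _ = self.decoder(rep)
        return z, recon

# Training minimizes mse_loss(recon, x); z is then used as the time-series
# feature vector alongside the statistical features.
</code></pre>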
<p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 27 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.02408">arXiv:2408.02408</a> <span> [<a href="https://arxiv.org/pdf/2408.02408">pdf</a>, <a href="https://arxiv.org/format/2408.02408">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1145/3689095.3689103">10.1145/3689095.3689103 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Multi-weather Cross-view Geo-localization Using Denoising Diffusion Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Feng%2C+T">Tongtong Feng</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Q">Qing Li</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xin Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+M">Mingzi Wang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+G">Guangyao Li</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+W">Wenwu Zhu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2408.02408v2-abstract-full"> Cross-view geo-localization in GNSS-denied environments aims to determine an unknown location by matching drone-view images with the correct geo-tagged satellite-view images from a large gallery. Recent research shows that learning discriminative image representations under specific weather conditions can significantly enhance performance. However, the frequent occurrence of unseen extreme weather conditions hinders progress.
This paper introduces MCGF, a Multi-weather Cross-view Geo-localization Framework designed to dynamically adapt to unseen weather conditions. MCGF establishes a joint optimization between image restoration and geo-localization using denoising diffusion models. For image restoration, MCGF incorporates a shared encoder and a lightweight restoration module to help the backbone eliminate weather-specific information. For geo-localization, MCGF uses EVA-02 as a backbone for feature extraction, with cross-entropy loss for training and cosine distance for testing. Extensive experiments on University160k-WX demonstrate that MCGF achieves competitive results for geo-localization in varying weather conditions. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 5 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by ACM MM24 workshop</span> </p> </li>
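<p class="is-size-7">The test-time step this abstract names, ranking a satellite-view gallery by cosine distance to a drone-view query, can be sketched as below. The embeddings would come from the trained backbone (EVA-02 in the paper); the function itself is an illustrative assumption.</p>
<pre><code># Sketch of cosine-distance retrieval over a geo-tagged gallery, as named in
# the abstract. Shapes and the helper function are illustrative assumptions.
import torch
import torch.nn.functional as F

def rank_gallery(query_emb: torch.Tensor, gallery_emb: torch.Tensor) -> torch.Tensor:
    """query_emb: (d,); gallery_emb: (n, d). Returns gallery indices, best match first."""
    q = F.normalize(query_emb, dim=0)
    g = F.normalize(gallery_emb, dim=1)
    cos_sim = g @ q              # cosine similarity; distance = 1 - cos_sim
    return torch.argsort(cos_sim, descending=True)
</code></pre>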
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.00606">arXiv:2408.00606</a> <span> [<a href="https://arxiv.org/pdf/2408.00606">pdf</a>, <a href="https://arxiv.org/format/2408.00606">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1145/3664647.3681151">10.1145/3664647.3681151 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> U2UData: A Large-scale Cooperative Perception Dataset for Swarm UAVs Autonomous Flight </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Feng%2C+T">Tongtong Feng</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xin Wang</a>, <a href="/search/cs?searchtype=author&query=Han%2C+F">Feilin Han</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+L">Leping Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+W">Wenwu Zhu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2408.00606v3-abstract-full"> Modern perception systems for autonomous flight are sensitive to occlusion and have limited long-range capability, which is a key bottleneck in improving low-altitude economic task performance. Recent research has shown that the UAV-to-UAV (U2U) cooperative perception system has great potential to revolutionize the autonomous flight industry. However, the lack of a large-scale dataset is hindering progress in this area. This paper presents U2UData, the first large-scale cooperative perception dataset for swarm UAV autonomous flight. The dataset was collected by three UAVs flying autonomously in the U2USim, covering a 9 km$^2$ flight area. It comprises 315K LiDAR frames, 945K RGB and depth frames, and 2.41M annotated 3D bounding boxes for 3 classes. It also includes brightness, temperature, humidity, smoke, and airflow values covering all flight routes. U2USim is the first swarm-UAV simulation environment mapped from the real world. It takes Yunnan Province as the prototype and includes 4 terrains, 7 weather conditions, and 8 sensor types. U2UData introduces two perception tasks: cooperative 3D object detection and cooperative 3D object tracking. This paper provides comprehensive benchmarks of recent cooperative perception algorithms on these tasks. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 1 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by ACM MM24</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.19638">arXiv:2407.19638</a> <span> [<a href="https://arxiv.org/pdf/2407.19638">pdf</a>, <a href="https://arxiv.org/format/2407.19638">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> From Pre-training Corpora to Large Language Models: What Factors Influence LLM Performance in Causal Discovery Tasks?
</p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Feng%2C+T">Tao Feng</a>, <a href="/search/cs?searchtype=author&query=Qu%2C+L">Lizhen Qu</a>, <a href="/search/cs?searchtype=author&query=Tandon%2C+N">Niket Tandon</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zhuang Li</a>, <a href="/search/cs?searchtype=author&query=Kang%2C+X">Xiaoxi Kang</a>, <a href="/search/cs?searchtype=author&query=Haffari%2C+G">Gholamreza Haffari</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2407.19638v1-abstract-full"> Recent advances in artificial intelligence have seen Large Language Models (LLMs) demonstrate notable proficiency in causal discovery tasks. This study explores the factors influencing the performance of LLMs in causal discovery tasks. Utilizing open-source LLMs, we examine how the frequency of causal relations within their pre-training corpora affects their ability to accurately respond to causal discovery queries. Our findings reveal that a higher frequency of causal mentions correlates with better model performance, suggesting that extensive exposure to causal information during training enhances the models' causal discovery capabilities. Additionally, we investigate the impact of context on the validity of causal relations. Our results indicate that LLMs might exhibit divergent predictions for identical causal relations when presented in different contexts. This paper provides the first comprehensive analysis of how different factors contribute to LLM performance in causal discovery tasks. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024.
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.10246">arXiv:2407.10246</a> <span> [<a href="https://arxiv.org/pdf/2407.10246">pdf</a>, <a href="https://arxiv.org/ps/2407.10246">ps</a>, <a href="https://arxiv.org/format/2407.10246">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computers and Society">cs.CY</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> </div> </div> <p class="title is-5 mathjax"> CourseAssist: Pedagogically Appropriate AI Tutor for Computer Science Education </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Feng%2C+T">Ty Feng</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+S">Sa Liu</a>, <a href="/search/cs?searchtype=author&query=Ghosal%2C+D">Dipak Ghosal</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2407.10246v3-abstract-full"> The growing enrollments in computer science courses and increases in class sizes necessitate scalable, automated tutoring solutions to adequately support student learning. While Large Language Models (LLMs) like GPT-4 have demonstrated potential in assisting students through question-answering, educators express concerns over student overreliance, miscomprehension of generated code, and the risk of inaccurate answers. Rather than banning these tools outright, we advocate for a constructive approach that harnesses the capabilities of AI while mitigating potential risks. This poster introduces CourseAssist, a novel LLM-based tutoring system tailored for computer science education. Unlike generic LLM systems, CourseAssist uses retrieval-augmented generation, user intent classification, and question decomposition to align AI responses with specific course materials and learning objectives, thereby ensuring the pedagogical appropriateness of LLMs in educational settings. We evaluated CourseAssist against a baseline of GPT-4 using a dataset of 50 question-answer pairs from a programming languages course, focusing on the criteria of usefulness, accuracy, and pedagogical appropriateness. Evaluation results show that CourseAssist significantly outperforms the baseline, demonstrating its potential to serve as an effective learning assistant.
We have also deployed CourseAssist in 6 computer science courses at a large public R1 research university, reaching over 500 students. Interviews with 20 student users show that CourseAssist improves computer science instruction by increasing the accessibility of course-specific tutoring help and shortening the feedback loop on their programming assignments. Future work will include extensive pilot testing at more universities and exploring better collaborative relationships between students, educators, and AI that improve computer science learning experiences. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 1 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to SIGCSE Virtual 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.10200">arXiv:2407.10200</a> <span> [<a href="https://arxiv.org/pdf/2407.10200">pdf</a>, <a href="https://arxiv.org/format/2407.10200">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Shape2Scene: 3D Scene Representation Learning Through Pre-training on Shape Data </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Feng%2C+T">Tuo Feng</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+W">Wenguan Wang</a>, <a href="/search/cs?searchtype=author&query=Quan%2C+R">Ruijie Quan</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Y">Yi Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2407.10200v1-abstract-full"> Current 3D self-supervised learning methods for 3D scenes face a data desert issue, resulting from the time-consuming and expensive collection process of 3D scene data. Conversely, 3D shape datasets are easier to collect.
Despite this, existing pre-training strategies on shape data offer limited potential for 3D scene understanding due to significant disparities in point quantities. To tackle these challenges, we propose Shape2Scene (S2S), a novel method that learns representations of large-scale 3D scenes from 3D shape data. We first design multiscale and high-resolution backbones for shape-level and scene-level 3D tasks, i.e., MH-P (point-based) and MH-V (voxel-based). MH-P/V establishes direct paths to high-resolution features that capture deep semantic information across multiple scales. This pivotal nature makes them suitable for a wide range of 3D downstream tasks that tightly rely on high-resolution features. We then employ a Shape-to-Scene strategy (S2SS) to amalgamate points from various shapes, creating a random pseudo scene (comprising multiple objects) as training data, mitigating disparities between shapes and scenes. Finally, a point-point contrastive loss (PPC) is applied for the pre-training of MH-P/V. In PPC, the inherent correspondence (i.e., point pairs) is naturally obtained in S2SS. Extensive experiments have demonstrated the transferability of 3D representations learned by MH-P/V across shape-level and scene-level 3D tasks. MH-P achieves notable performance on well-known point cloud datasets (93.8% OA on ScanObjectNN and 87.6% instance mIoU on ShapeNetPart). MH-V also achieves promising performance in 3D semantic segmentation and 3D object detection. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p>
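<p class="is-size-7">A point-pair contrastive loss of the kind PPC describes can be sketched with the standard InfoNCE form below, where row i of each feature matrix describes the same point in two views and every other point serves as a negative. The temperature is an illustrative assumption, not the paper's setting.</p>
<pre><code># Sketch of a point-pair InfoNCE loss in the spirit of PPC: matching rows of
# f1 and f2 are positive pairs; off-diagonal rows act as negatives.
import torch
import torch.nn.functional as F

def point_infonce(f1: torch.Tensor, f2: torch.Tensor, tau: float = 0.07) -> torch.Tensor:
    # f1, f2: (n_points, d), row i of each describing the same point.
    f1 = F.normalize(f1, dim=1)
    f2 = F.normalize(f2, dim=1)
    logits = f1 @ f2.t() / tau            # (n, n) similarity matrix
    targets = torch.arange(f1.size(0))    # positives lie on the diagonal
    return F.cross_entropy(logits, targets)
</code></pre>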
<p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ECCV 2024; Project page: https://github.com/FengZicai/S2S</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.04486">arXiv:2407.04486</a> <span> [<a href="https://arxiv.org/pdf/2407.04486">pdf</a>, <a href="https://arxiv.org/format/2407.04486">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Quantitative Methods">q-bio.QM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Variational and Explanatory Neural Networks for Encoding Cancer Profiles and Predicting Drug Responses </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Feng%2C+T">Tianshu Feng</a>, <a href="/search/cs?searchtype=author&query=Gnanaolivu%2C+R">Rohan Gnanaolivu</a>, <a href="/search/cs?searchtype=author&query=Safikhani%2C+A">Abolfazl Safikhani</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yuanhang Liu</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+J">Jun Jiang</a>, <a href="/search/cs?searchtype=author&query=Chia%2C+N">Nicholas Chia</a>, <a href="/search/cs?searchtype=author&query=Partin%2C+A">Alexander Partin</a>, <a href="/search/cs?searchtype=author&query=Vasanthakumari%2C+P">Priyanka Vasanthakumari</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+Y">Yitan Zhu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+C">Chen Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2407.04486v1-abstract-full"> Human cancers present a significant public health challenge and require the discovery of novel drugs through translational research. Transcriptomics profiling data that describe molecular activities in tumors and cancer cell lines are widely utilized for predicting anti-cancer drug responses. However, existing AI models face challenges due to noise in transcriptomics data and a lack of biological interpretability. To overcome these limitations, we introduce VETE (Variational and Explanatory Transcriptomics Encoder), a novel neural network framework that incorporates a variational component to mitigate noise effects and integrates traceable gene ontology into the neural network architecture for encoding cancer transcriptomics data.
Key innovations include a local interpretability-guided method for identifying ontology paths, a visualization tool to elucidate the biological mechanisms of drug responses, and the application of centralized, large-scale hyperparameter optimization. VETE demonstrated robust accuracy in cancer cell line classification and drug response prediction, provided traceable biological explanations for both tasks, and offered insights into the mechanisms underlying its predictions. VETE bridges the gap between AI-driven predictions and biologically meaningful insights in cancer research, representing a promising advancement in the field.
Submitted 5 July, 2024; originally announced July 2024.

arXiv:2406.20015 (https://arxiv.org/abs/2406.20015) [cs.CL, cs.AI]
ToolBeHonest: A Multi-level Hallucination Diagnostic Benchmark for Tool-Augmented Large Language Models
Authors: Yuxiang Zhang, Jing Chen, Junjie Wang, Yaxin Liu, Cheng Yang, Chufan Shi, Xinyu Zhu, Zihao Lin, Hanwen Wan, Yujiu Yang, Tetsuya Sakai, Tian Feng, Hayato Yamana
Abstract: Tool-augmented large language models (LLMs) are rapidly being integrated into real-world applications. Due to the lack of benchmarks, the community has yet to fully understand the hallucination issues within these models. To address this challenge, we introduce a comprehensive diagnostic benchmark, ToolBH. Specifically, we assess the LLM's hallucinations through two perspectives: depth and breadth. In terms of depth, we propose a multi-level diagnostic process, including (1) solvability detection, (2) solution planning, and (3) missing-tool analysis. For breadth, we consider three scenarios based on the characteristics of the toolset: missing necessary tools, potential tools, and limited functionality tools. Furthermore, we developed seven tasks and collected 700 evaluation samples through multiple rounds of manual annotation. The results show the significant challenges presented by the ToolBH benchmark. The current advanced models Gemini-1.5-Pro and GPT-4o only achieve total scores of 45.3 and 37.0, respectively, on a scale of 100. In this benchmark, larger model parameters do not guarantee better performance; the training data and response strategies also play crucial roles in tool-enhanced LLM scenarios. Our diagnostic analysis indicates that the primary reason for model errors lies in assessing task solvability. Additionally, open-weight models suffer from performance drops with verbose replies, whereas proprietary models excel with longer reasoning.
Submitted 4 October, 2024; v1 submitted 28 June, 2024; originally announced June 2024.
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.17300">arXiv:2406.17300</a> <span> [<a href="https://arxiv.org/pdf/2406.17300">pdf</a>, <a href="https://arxiv.org/format/2406.17300">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> CausalScore: An Automatic Reference-Free Metric for Assessing Response Relevance in Open-Domain Dialogue Systems </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Feng%2C+T">Tao Feng</a>, <a href="/search/cs?searchtype=author&query=Qu%2C+L">Lizhen Qu</a>, <a href="/search/cs?searchtype=author&query=Kang%2C+X">Xiaoxi Kang</a>, <a href="/search/cs?searchtype=author&query=Haffari%2C+G">Gholamreza Haffari</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.17300v1-abstract-short" style="display: inline;"> Automatically evaluating the quality of responses in open-domain dialogue systems is a challenging but crucial task. Current evaluation metrics often fail to align with human judgments, especially when assessing responses that are grammatically correct. To address this issue, we propose a novel metric, called CausalScore, which assesses the relevance of responses by measuring the causal strength b… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.17300v1-abstract-full').style.display = 'inline'; document.getElementById('2406.17300v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.17300v1-abstract-full" style="display: none;"> Automatically evaluating the quality of responses in open-domain dialogue systems is a challenging but crucial task. Current evaluation metrics often fail to align with human judgments, especially when assessing responses that are grammatically correct. To address this issue, we propose a novel metric, called CausalScore, which assesses the relevance of responses by measuring the causal strength between dialogue histories and responses. The causal strength is estimated by utilizing both unconditional dependence and conditional dependencies from the dialogue history to responses. We compare our metric with the existing competitive metrics in terms of their alignment with human judgements. Our experimental results demonstrate that CausalScore significantly surpasses existing state-of-the-art metrics by aligning better with human judgements. Additionally, we collect a new dialogue dataset CGDIALOG+ with human-annotated causal relations and a set of pairwise human judgements to facilitate the development of future automatic metrics. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.17300v1-abstract-full').style.display = 'none'; document.getElementById('2406.17300v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.15490">arXiv:2406.15490</a> <span> [<a href="https://arxiv.org/pdf/2406.15490">pdf</a>, <a href="https://arxiv.org/format/2406.15490">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Causal Discovery Inspired Unsupervised Domain Adaptation for Emotion-Cause Pair Extraction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Hua%2C+Y">Yuncheng Hua</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+Y">Yujin Huang</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+S">Shuo Huang</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+T">Tao Feng</a>, <a href="/search/cs?searchtype=author&query=Qu%2C+L">Lizhen Qu</a>, <a href="/search/cs?searchtype=author&query=Bain%2C+C">Chris Bain</a>, <a href="/search/cs?searchtype=author&query=Bassed%2C+R">Richard Bassed</a>, <a href="/search/cs?searchtype=author&query=Haffari%2C+G">Gholamreza Haffari</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.15490v1-abstract-short" style="display: inline;"> This paper tackles the task of emotion-cause pair extraction in the unsupervised domain adaptation setting. The problem is challenging as the distributions of the events causing emotions in target domains are dramatically different than those in source domains, despite the distributions of emotional expressions between domains are overlapped. Inspired by causal discovery, we propose a novel deep l… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.15490v1-abstract-full').style.display = 'inline'; document.getElementById('2406.15490v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.15490v1-abstract-full" style="display: none;"> This paper tackles the task of emotion-cause pair extraction in the unsupervised domain adaptation setting. The problem is challenging as the distributions of the events causing emotions in target domains are dramatically different than those in source domains, despite the distributions of emotional expressions between domains are overlapped. Inspired by causal discovery, we propose a novel deep latent model in the variational autoencoder (VAE) framework, which not only captures the underlying latent structures of data but also utilizes the easily transferable knowledge of emotions as the bridge to link the distributions of events in different domains. To facilitate knowledge transfer across domains, we also propose a novel variational posterior regularization technique to disentangle the latent representations of emotions from those of events in order to mitigate the damage caused by the spurious correlations related to the events in source domains. 
Through extensive experiments, we demonstrate that our model outperforms the strongest baseline by approximately 11.05% on a Chinese benchmark and 2.45% on an English benchmark in terms of weighted-average F1 score. The source code will be publicly available upon acceptance.
Submitted 18 June, 2024; originally announced June 2024.
Comments: 12 pages, 6 figures, 4 tables; under review at EMNLP 2024
ACM Class: I.2.4

arXiv:2406.12816 (https://arxiv.org/abs/2406.12816) [cs.LG, cs.CV, eess.IV]
Neural Approximate Mirror Maps for Constrained Diffusion Models
Authors: Berthy T. Feng, Ricardo Baptista, Katherine L. Bouman
Abstract: Diffusion models excel at creating visually convincing images, but they often struggle to meet subtle constraints inherent in the training data. Such constraints could be physics-based (e.g., satisfying a PDE), geometric (e.g., respecting symmetry), or semantic (e.g., including a particular number of objects).
When the training data all satisfy a certain constraint, enforcing this constraint on a diffusion model not only improves its distribution-matching accuracy but also makes it more reliable for generating valid synthetic data and solving constrained inverse problems. However, existing methods for constrained diffusion models are inflexible with different types of constraints. Recent work proposed to learn mirror diffusion models (MDMs) in an unconstrained space defined by a mirror map and to impose the constraint with an inverse mirror map, but analytical mirror maps are challenging to derive for complex constraints. We propose neural approximate mirror maps (NAMMs) for general constraints. Our approach only requires a differentiable distance function from the constraint set. We learn an approximate mirror map that pushes data into an unconstrained space and a corresponding approximate inverse that maps data back to the constraint set. A generative model, such as an MDM, can then be trained in the learned mirror space and its samples restored to the constraint set by the inverse map. We validate our approach on a variety of constraints, showing that, compared to an unconstrained diffusion model, a NAMM-based MDM substantially improves constraint satisfaction. We also demonstrate how existing diffusion-based inverse-problem solvers can be easily applied in the learned mirror space to solve constrained inverse problems.
Submitted 18 June, 2024; originally announced June 2024.
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.11683">arXiv:2406.11683</a> <span> [<a href="https://arxiv.org/pdf/2406.11683">pdf</a>, <a href="https://arxiv.org/format/2406.11683">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> HoLLMwood: Unleashing the Creativity of Large Language Models in Screenwriting via Role Playing </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+J">Jing Chen</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+X">Xinyu Zhu</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+C">Cheng Yang</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+C">Chufan Shi</a>, <a href="/search/cs?searchtype=author&query=Xi%2C+Y">Yadong Xi</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yuxiang Zhang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+J">Junjie Wang</a>, <a href="/search/cs?searchtype=author&query=Pu%2C+J">Jiashu Pu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+R">Rongsheng Zhang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Y">Yujiu Yang</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+T">Tian Feng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.11683v1-abstract-short" style="display: inline;"> Generative AI has demonstrated unprecedented creativity in the field of computer vision, yet such phenomena have not been observed in natural language processing. In particular, large language models (LLMs) can hardly produce written works at the level of human experts due to the extremely high complexity of literature writing. In this paper, we present HoLLMwood, an automated framework for unleas… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.11683v1-abstract-full').style.display = 'inline'; document.getElementById('2406.11683v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.11683v1-abstract-full" style="display: none;"> Generative AI has demonstrated unprecedented creativity in the field of computer vision, yet such phenomena have not been observed in natural language processing. In particular, large language models (LLMs) can hardly produce written works at the level of human experts due to the extremely high complexity of literature writing. In this paper, we present HoLLMwood, an automated framework for unleashing the creativity of LLMs and exploring their potential in screenwriting, which is a highly demanding task. Mimicking the human creative process, we assign LLMs to different roles involved in the real-world scenario. In addition to the common practice of treating LLMs as ${Writer}$, we also apply LLMs as ${Editor}$, who is responsible for providing feedback and revision advice to ${Writer}$. Besides, to enrich the characters and deepen the plots, we introduce a role-playing mechanism and adopt LLMs as ${Actors}$ that can communicate and interact with each other. 
Evaluations on automatically generated screenplays show that HoLLMwood substantially outperforms strong baselines in terms of coherence, relevance, interestingness, and overall quality.
Submitted 17 June, 2024; originally announced June 2024.

arXiv:2406.10318 (https://arxiv.org/abs/2406.10318) [cs.CV, cs.AI]
Creating a Lens of Chinese Culture: A Multimodal Dataset for Chinese Pun Rebus Art Understanding
Authors: Tuo Zhang, Tiantian Feng, Yibin Ni, Mengqin Cao, Ruying Liu, Katharine Butler, Yanjun Weng, Mi Zhang, Shrikanth S. Narayanan, Salman Avestimehr
Abstract: Large vision-language models (VLMs) have demonstrated remarkable abilities in understanding everyday content. However, their performance in the domain of art, particularly culturally rich art forms, remains less explored. As a pearl of human wisdom and creativity, art encapsulates complex cultural narratives and symbolism.
In this paper, we offer the Pun Rebus Art Dataset, a multimodal dataset for art understanding deeply rooted in traditional Chinese culture. We focus on three primary tasks: identifying salient visual elements, matching elements with their symbolic meanings, and explaining the conveyed messages. Our evaluation reveals that state-of-the-art VLMs struggle with these tasks, often providing biased and hallucinated explanations and showing limited improvement through in-context learning. By releasing the Pun Rebus Art Dataset, we aim to facilitate the development of VLMs that can better understand and interpret culturally specific content, promoting greater inclusiveness beyond English-based corpora.
Submitted 14 June, 2024; originally announced June 2024.

arXiv:2406.08800 (https://arxiv.org/abs/2406.08800) [cs.SD, cs.LG, eess.AS]
Can Synthetic Audio From Generative Foundation Models Assist Audio Recognition and Speech Modeling?
Authors: Tiantian Feng, Dimitrios Dimitriadis, Shrikanth Narayanan
Abstract: Recent advances in foundation models have enabled audio-generative models that produce high-fidelity sounds associated with music, events, and human actions.
Despite the success achieved by modern audio-generative models, the conventional approach to assessing the quality of audio generation relies heavily on distance metrics like Frechet Audio Distance. In contrast, we aim to evaluate the quality of audio generation by examining the effectiveness of the generated audio as training data. Specifically, we conduct studies to explore the use of synthetic audio for audio recognition. Moreover, we investigate whether synthetic audio can serve as a resource for data augmentation in speech-related modeling. Our comprehensive experiments demonstrate the potential of using synthetic audio for audio recognition and speech-related modeling. Our code is available at https://github.com/usc-sail/SynthAudio.
Submitted 29 August, 2024; v1 submitted 13 June, 2024; originally announced June 2024.
Comments: Accepted to 2024 INTERSPEECH; corrections to ActivityNet labels

arXiv:2406.08644 (https://arxiv.org/abs/2406.08644) [eess.SP, cs.AI, cs.SD, eess.AS]
Toward Fully-End-to-End Listened Speech Decoding from EEG Signals
Authors: Jihwan Lee, Aditya Kommineni, Tiantian Feng, Kleanthis Avramidis, Xuan Shi, Sudarsana Kadiri, Shrikanth Narayanan
Abstract: Speech decoding from EEG signals is a challenging task, in which brain activity is modeled to estimate salient characteristics of acoustic stimuli. We propose FESDE, a novel framework for Fully-End-to-end Speech Decoding from EEG signals.
Our approach aims to directly reconstruct listened speech waveforms from EEG signals, with no intermediate acoustic feature processing step required. The proposed method consists of an EEG module and a speech module along with a connector. The EEG module learns to better represent EEG signals, while the speech module generates speech waveforms from model representations. The connector learns to bridge the distributions of the latent spaces of EEG and speech. The proposed framework is both simple and efficient, allowing single-step inference, and outperforms prior works on objective metrics. A fine-grained phoneme analysis is conducted to unveil model characteristics of speech decoding. The source code is available here: github.com/lee-jhwn/fesde.
Submitted 12 June, 2024; originally announced June 2024.
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">accepted to Interspeech2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.07890">arXiv:2406.07890</a> <span> [<a href="https://arxiv.org/pdf/2406.07890">pdf</a>, <a href="https://arxiv.org/format/2406.07890">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Exploring Speech Foundation Models for Speaker Diarization in Child-Adult Dyadic Interactions </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xu%2C+A">Anfeng Xu</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+K">Kevin Huang</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+T">Tiantian Feng</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+L">Lue Shen</a>, <a href="/search/cs?searchtype=author&query=Tager-Flusberg%2C+H">Helen Tager-Flusberg</a>, <a href="/search/cs?searchtype=author&query=Narayanan%2C+S">Shrikanth Narayanan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.07890v1-abstract-short" style="display: inline;"> Speech foundation models, trained on vast datasets, have opened unique opportunities in addressing challenging low-resource speech understanding, such as child speech. In this work, we explore the capabilities of speech foundation models on child-adult speaker diarization. We show that exemplary foundation models can achieve 39.5% and 62.3% relative reductions in Diarization Error Rate and Speaker… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.07890v1-abstract-full').style.display = 'inline'; document.getElementById('2406.07890v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.07890v1-abstract-full" style="display: none;"> Speech foundation models, trained on vast datasets, have opened unique opportunities in addressing challenging low-resource speech understanding, such as child speech. In this work, we explore the capabilities of speech foundation models on child-adult speaker diarization. We show that exemplary foundation models can achieve 39.5% and 62.3% relative reductions in Diarization Error Rate and Speaker Confusion Rate, respectively, compared to previous speaker diarization methods. In addition, we benchmark and evaluate the speaker diarization results of the speech foundation models with varying the input audio window size, speaker demographics, and training data ratio. Our results highlight promising pathways for understanding and adopting speech foundation models to facilitate child speech understanding. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.07890v1-abstract-full').style.display = 'none'; document.getElementById('2406.07890v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Interspeech 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.02785">arXiv:2406.02785</a> <span> [<a href="https://arxiv.org/pdf/2406.02785">pdf</a>, <a href="https://arxiv.org/format/2406.02785">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Instrumentation and Methods for Astrophysics">astro-ph.IM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.3847/1538-4357/ad737f">10.3847/1538-4357/ad737f <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Event-horizon-scale Imaging of M87* under Different Assumptions via Deep Generative Image Priors </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Feng%2C+B+T">Berthy T. Feng</a>, <a href="/search/cs?searchtype=author&query=Bouman%2C+K+L">Katherine L. Bouman</a>, <a href="/search/cs?searchtype=author&query=Freeman%2C+W+T">William T. Freeman</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.02785v2-abstract-short" style="display: inline;"> Reconstructing images from the Event Horizon Telescope (EHT) observations of M87*, the supermassive black hole at the center of the galaxy M87, depends on a prior to impose desired image statistics. However, given the impossibility of directly observing black holes, there is no clear choice for a prior. We present a framework for flexibly designing a range of priors, each bringing different biases… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.02785v2-abstract-full').style.display = 'inline'; document.getElementById('2406.02785v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.02785v2-abstract-full" style="display: none;"> Reconstructing images from the Event Horizon Telescope (EHT) observations of M87*, the supermassive black hole at the center of the galaxy M87, depends on a prior to impose desired image statistics. However, given the impossibility of directly observing black holes, there is no clear choice for a prior. We present a framework for flexibly designing a range of priors, each bringing different biases to the image reconstruction. 
These priors can be weak (e.g., impose only basic natural-image statistics) or strong (e.g., impose assumptions of black-hole structure). Our framework uses Bayesian inference with score-based priors, which are data-driven priors arising from a deep generative model that can learn complicated image distributions. Using our Bayesian imaging approach with sophisticated data-driven priors, we can assess how the visual features and uncertainty of reconstructed images change depending on the prior. In addition to simulated data, we image the real EHT M87* data and discuss how recovered features are influenced by the choice of prior.
Submitted 9 November, 2024; v1 submitted 4 June, 2024; originally announced June 2024.
Journal ref: ApJ 975 201 (2024)

arXiv:2406.02609 (https://arxiv.org/abs/2406.02609) [cs.LG, cs.AI]
Less is More: Pseudo-Label Filtering for Continual Test-Time Adaptation
Authors: Jiayao Tan, Fan Lyu, Chenggong Ni, Tingliang Feng, Fuyuan Hu, Zhang Zhang, Shaochuang Zhao, Liang Wang
Abstract: Continual Test-Time Adaptation (CTTA) aims to adapt a pre-trained model to a sequence of target domains during the test phase without accessing the source data. To adapt to unlabeled data from unknown domains, existing methods rely on constructing pseudo-labels for all samples and updating the model through self-training.
However, these pseudo-labels often involve noise, leading to insufficient adaptation. To improve the quality of pseudo-labels, we propose a pseudo-label selection method for CTTA, called Pseudo Labeling Filter (PLF). The key idea of PLF is to continually select appropriate thresholds for pseudo-labels and identify reliable ones for self-training. Specifically, we present three principles for setting thresholds during continuous domain learning: initialization, growth, and diversity. Based on these principles, we design Self-Adaptive Thresholding to filter pseudo-labels. Additionally, we introduce a Class Prior Alignment (CPA) method to encourage the model to make diverse predictions for unknown domain samples. Through extensive experiments, PLF outperforms current state-of-the-art methods, proving its effectiveness in CTTA.
Submitted 12 July, 2024; v1 submitted 3 June, 2024; originally announced June 2024.
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">arXiv admin note: text overlap with arXiv:2310.03335 by other authors</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.10313">arXiv:2405.10313</a> <span> [<a href="https://arxiv.org/pdf/2405.10313">pdf</a>, <a href="https://arxiv.org/format/2405.10313">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computers and Society">cs.CY</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> How Far Are We From AGI </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Feng%2C+T">Tao Feng</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+C">Chuanyang Jin</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+J">Jingyu Liu</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+K">Kunlun Zhu</a>, <a href="/search/cs?searchtype=author&query=Tu%2C+H">Haoqin Tu</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+Z">Zirui Cheng</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+G">Guanyu Lin</a>, <a href="/search/cs?searchtype=author&query=You%2C+J">Jiaxuan You</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.10313v1-abstract-short" style="display: inline;"> The evolution of artificial intelligence (AI) has profoundly impacted human society, driving significant advancements in multiple sectors. Yet, the escalating demands on AI have highlighted the limitations of AI's current offerings, catalyzing a movement towards Artificial General Intelligence (AGI). AGI, distinguished by its ability to execute diverse real-world tasks with efficiency and effectiv… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.10313v1-abstract-full').style.display = 'inline'; document.getElementById('2405.10313v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.10313v1-abstract-full" style="display: none;"> The evolution of artificial intelligence (AI) has profoundly impacted human society, driving significant advancements in multiple sectors. Yet, the escalating demands on AI have highlighted the limitations of AI's current offerings, catalyzing a movement towards Artificial General Intelligence (AGI). AGI, distinguished by its ability to execute diverse real-world tasks with efficiency and effectiveness comparable to human intelligence, reflects a paramount milestone in AI evolution. While existing works have summarized specific recent advancements of AI, they lack a comprehensive discussion of AGI's definitions, goals, and developmental trajectories. Different from existing survey papers, this paper delves into the pivotal questions of our proximity to AGI and the strategies necessary for its realization through extensive surveys, discussions, and original perspectives. 
We start by articulating the requisite capability frameworks for AGI, integrating the internal, interface, and system dimensions. As the realization of AGI requires more advanced capabilities and adherence to stringent constraints, we further discuss necessary AGI alignment technologies to harmonize these factors. Notably, we emphasize the importance of approaching AGI responsibly by first defining the key levels of AGI progression, followed by an evaluation framework that situates the status quo, and finally our roadmap for reaching the pinnacle of AGI. Moreover, to give tangible insights into the ubiquitous impact of the integration of AI, we outline existing challenges and potential pathways toward AGI in multiple domains. In sum, serving as a pioneering exploration into the current state and future trajectory of AGI, this paper aims to foster a collective comprehension and catalyze broader public discussions among researchers and practitioners on AGI.
Submitted 16 May, 2024; originally announced May 2024.

arXiv:2405.08036 (https://arxiv.org/abs/2405.08036) [cs.LG, cs.AI]
POWQMIX: Weighted Value Factorization with Potentially Optimal Joint Actions Recognition for Cooperative Multi-Agent Reinforcement Learning
Authors: Chang Huang, Junqiao Zhao, Shatong Zhu, Hongtu Zhou, Chen Ye, Tiantian Feng, Changjun Jiang
Abstract: Value function factorization methods are commonly used in cooperative multi-agent reinforcement learning, with QMIX receiving significant attention. Many QMIX-based methods introduce monotonicity constraints between the joint action value and individual action values to achieve decentralized execution.
However, such constraints limit the representation capacity of value factorization, restricting the joint action values it can represent and hindering the learning of the optimal policy. To address this challenge, we propose the Potentially Optimal joint actions Weighted QMIX (POWQMIX) algorithm, which recognizes the potentially optimal joint actions and assigns higher weights to the corresponding losses of these joint actions during training. We theoretically prove that with such a weighted training approach the optimal policy is guaranteed to be recovered. Experiments in matrix games, predator-prey, and StarCraft II Multi-Agent Challenge environments demonstrate that our algorithm outperforms the state-of-the-art value-based multi-agent reinforcement learning methods.
Submitted 15 May, 2024; v1 submitted 12 May, 2024; originally announced May 2024.
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">change reference format</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.02299">arXiv:2405.02299</a> <span> [<a href="https://arxiv.org/pdf/2405.02299">pdf</a>, <a href="https://arxiv.org/format/2405.02299">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computational Engineering, Finance, and Science">cs.CE</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Deep Reinforcement Learning for Modelling Protein Complexes </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Gao%2C+Z">Ziqi Gao</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+T">Tao Feng</a>, <a href="/search/cs?searchtype=author&query=You%2C+J">Jiaxuan You</a>, <a href="/search/cs?searchtype=author&query=Zi%2C+C">Chenyi Zi</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+Y">Yan Zhou</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+C">Chen Zhang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+J">Jia Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.02299v2-abstract-short" style="display: inline;"> AlphaFold can be used for both single-chain and multi-chain protein structure prediction, while the latter becomes extremely challenging as the number of chains increases. In this work, by taking each chain as a node and assembly actions as edges, we show that an acyclic undirected connected graph can be used to predict the structure of multi-chain protein complexes (a.k.a., protein complex modell… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.02299v2-abstract-full').style.display = 'inline'; document.getElementById('2405.02299v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.02299v2-abstract-full" style="display: none;"> AlphaFold can be used for both single-chain and multi-chain protein structure prediction, while the latter becomes extremely challenging as the number of chains increases. In this work, by taking each chain as a node and assembly actions as edges, we show that an acyclic undirected connected graph can be used to predict the structure of multi-chain protein complexes (a.k.a., protein complex modelling, PCM). However, there are still two challenges: 1) The huge combinatorial optimization space of $N^{N-2}$ ($N$ is the number of chains) for the PCM problem can easily lead to high computational cost. 2) The scales of protein complexes exhibit distribution shift due to variance in chain numbers, which calls for the generalization in modelling complexes of various scales. To address these challenges, we propose GAPN, a Generative Adversarial Policy Network powered by domain-specific rewards and adversarial loss through policy gradient for automatic PCM prediction. Specifically, GAPN learns to efficiently search through the immense assembly space and optimize the direct docking reward through policy gradient. 
arXiv:2405.01316 [cs.RO] https://arxiv.org/abs/2405.01316
LOG-LIO2: A LiDAR-Inertial Odometry with Efficient Uncertainty Analysis
Authors: Kai Huang, Junqiao Zhao, Jiaye Lin, Zhongyang Zhu, Shuangfu Song, Chen Ye, Tiantian Feng
Abstract: Uncertainty in LiDAR measurements, stemming from factors such as range sensing, is crucial for LIO (LiDAR-Inertial Odometry) systems as it affects the accurate weighting in the loss function. While recent LIO systems address uncertainty related to range sensing, the impact of incident angle on uncertainty is often overlooked by the community. Moreover, the existing uncertainty propagation methods suffer from computational inefficiency. This paper proposes a comprehensive point uncertainty model that accounts for both the uncertainties from LiDAR measurements and surface characteristics, along with an efficient local uncertainty analytical method for the LiDAR-based state estimation problem. We employ a projection operator that separates the uncertainty into the ray direction and its orthogonal plane. Then, we derive incremental Jacobian matrices of eigenvalues and eigenvectors w.r.t. points, which enables a fast approximation of uncertainty propagation. This approach eliminates the requirement for redundant traversal of points, significantly reducing the time complexity of uncertainty propagation from $\mathcal{O}(n)$ to $\mathcal{O}(1)$ when a new point is added. Simulations and experiments on public datasets are conducted to validate the accuracy and efficiency of our formulations. The proposed methods have been integrated into a LIO system, which is available at https://github.com/tiev-tongji/LOG-LIO2.
Submitted 5 August, 2024; v1 submitted 2 May, 2024; originally announced May 2024.
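The $\mathcal{O}(1)$ claim rests on updating local surface statistics incrementally when a point arrives rather than re-traversing the neighborhood. A minimal sketch of that general pattern (a Welford-style running mean/covariance in NumPy; the paper's incremental eigenvalue/eigenvector Jacobians are more involved and are not reproduced here):

```python
import numpy as np

# Sketch of the O(1)-per-point pattern: keep running statistics of a local
# surface patch and update them incrementally when a new LiDAR return
# arrives, instead of re-traversing all n points.
class IncrementalPatchStats:
    def __init__(self):
        self.n = 0
        self.mean = np.zeros(3)
        self.M2 = np.zeros((3, 3))  # accumulated outer products of deviations

    def add(self, p: np.ndarray) -> None:
        self.n += 1
        delta = p - self.mean
        self.mean += delta / self.n
        self.M2 += np.outer(delta, p - self.mean)  # Welford-style update

    def cov(self) -> np.ndarray:
        return self.M2 / max(self.n - 1, 1)
```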
arXiv:2404.17983 [cs.SD, cs.CL, eess.AS] https://arxiv.org/abs/2404.17983
TI-ASU: Toward Robust Automatic Speech Understanding through Text-to-speech Imputation Against Missing Speech Modality
Authors: Tiantian Feng, Xuan Shi, Rahul Gupta, Shrikanth S. Narayanan
Abstract: Automatic Speech Understanding (ASU) aims at human-like speech interpretation, providing nuanced intent, emotion, sentiment, and content understanding from speech and language (text) content conveyed in speech. Typically, training a robust ASU model relies heavily on acquiring large-scale, high-quality speech and associated transcriptions. However, it is often challenging to collect or use speech data for training ASU due to concerns such as privacy. To approach this setting, in which the speech (audio) modality is missing, we propose TI-ASU, which uses a pre-trained text-to-speech model to impute the missing speech. We report extensive experiments evaluating TI-ASU on various missing scales, both multi- and single-modality settings, and the use of LLMs. Our findings show that TI-ASU yields substantial benefits to improve ASU in scenarios where even up to 95% of training speech is missing. Moreover, we show that TI-ASU is adaptive to dropout training, improving model robustness in addressing missing speech during inference.
Submitted 27 April, 2024; originally announced April 2024.
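The imputation idea fits in a few lines. A hypothetical sketch: the `synthesize` callable stands in for any pre-trained text-to-speech model, and the field names are invented for illustration, not taken from the TI-ASU codebase:

```python
# Hypothetical sketch of TTS-based imputation for the missing speech modality.
def impute_speech(example: dict, synthesize) -> dict:
    if example.get("audio") is None and example.get("text"):
        example["audio"] = synthesize(example["text"])  # waveform from TTS
        example["imputed"] = True                       # mark for bookkeeping
    return example
```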
arXiv:2404.14829 [cs.LG, cs.CV] https://arxiv.org/abs/2404.14829
Revisiting Neural Networks for Continual Learning: An Architectural Perspective
Authors: Aojun Lu, Tao Feng, Hangjie Yuan, Xiaotian Song, Yanan Sun
Abstract: Efforts to overcome catastrophic forgetting have primarily centered around developing more effective Continual Learning (CL) methods. In contrast, less attention has been devoted to analyzing the role of network architecture design (e.g., network depth, width, and components) in contributing to CL. This paper seeks to bridge the gap between network architecture design and CL, and to present a holistic study on the impact of network architectures on CL. This work considers architecture design at the network scaling level, i.e., width and depth, and also at the level of network components, i.e., skip connections, global pooling layers, and down-sampling. In both cases, we first derive insights by systematically exploring how architectural designs affect CL. Then, grounded in these insights, we craft a specialized search space for CL and propose a simple yet effective ArchCraft method to steer toward CL-friendly architectures; the method recrafts AlexNet/ResNet into AlexAC/ResAC. Experimental validation across various CL settings and scenarios demonstrates that the improved architectures are parameter-efficient, achieving state-of-the-art CL performance while being 86%, 61%, and 97% more compact in terms of parameters than the naive CL architecture in Task IL and Class IL. Code is available at https://github.com/byyx666/ArchCraft.
Submitted 28 April, 2024; v1 submitted 23 April, 2024; originally announced April 2024.
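For intuition, a search space over network scaling and components of the kind the abstract describes might look like the toy sketch below; all names and value grids are invented here, and the paper's actual search space differs:

```python
from itertools import product

# Toy sketch of a CL-oriented architecture search space over network scaling
# (width, depth) and components; values are illustrative assumptions.
search_space = {
    "width_multiplier": [0.5, 1.0, 2.0],
    "num_blocks": [2, 4, 8],
    "skip_connections": [True, False],
    "global_pooling": [True, False],
}

def candidate_configs(space):
    keys = list(space)
    for values in product(*(space[k] for k in keys)):
        yield dict(zip(keys, values))

print(sum(1 for _ in candidate_configs(search_space)))  # 36 candidates
```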
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.13504">arXiv:2404.13504</a> <span> [<a href="https://arxiv.org/pdf/2404.13504">pdf</a>, <a href="https://arxiv.org/format/2404.13504">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> IMO: Greedy Layer-Wise Sparse Representation Learning for Out-of-Distribution Text Classification with Pre-trained Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Feng%2C+T">Tao Feng</a>, <a href="/search/cs?searchtype=author&query=Qu%2C+L">Lizhen Qu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zhuang Li</a>, <a href="/search/cs?searchtype=author&query=Zhan%2C+H">Haolan Zhan</a>, <a href="/search/cs?searchtype=author&query=Hua%2C+Y">Yuncheng Hua</a>, <a href="/search/cs?searchtype=author&query=Haffari%2C+G">Gholamreza Haffari</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.13504v1-abstract-short" style="display: inline;"> Machine learning models have made incredible progress, but they still struggle when applied to examples from unseen domains. This study focuses on a specific problem of domain generalization, where a model is trained on one source domain and tested on multiple target domains that are unseen during training. We propose IMO: Invariant features Masks for Out-of-Distribution text classification, to ac… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.13504v1-abstract-full').style.display = 'inline'; document.getElementById('2404.13504v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.13504v1-abstract-full" style="display: none;"> Machine learning models have made incredible progress, but they still struggle when applied to examples from unseen domains. This study focuses on a specific problem of domain generalization, where a model is trained on one source domain and tested on multiple target domains that are unseen during training. We propose IMO: Invariant features Masks for Out-of-Distribution text classification, to achieve OOD generalization by learning invariant features. During training, IMO would learn sparse mask layers to remove irrelevant features for prediction, where the remaining features keep invariant. Additionally, IMO has an attention module at the token level to focus on tokens that are useful for prediction. Our comprehensive experiments show that IMO substantially outperforms strong baselines in terms of various evaluation metrics and settings. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.13504v1-abstract-full').style.display = 'none'; document.getElementById('2404.13504v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.09385">arXiv:2404.09385</a> <span> [<a href="https://arxiv.org/pdf/2404.09385">pdf</a>, <a href="https://arxiv.org/format/2404.09385">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> A Large-Scale Evaluation of Speech Foundation Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yang%2C+S">Shu-wen Yang</a>, <a href="/search/cs?searchtype=author&query=Chang%2C+H">Heng-Jui Chang</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+Z">Zili Huang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+A+T">Andy T. Liu</a>, <a href="/search/cs?searchtype=author&query=Lai%2C+C">Cheng-I Lai</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+H">Haibin Wu</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+J">Jiatong Shi</a>, <a href="/search/cs?searchtype=author&query=Chang%2C+X">Xuankai Chang</a>, <a href="/search/cs?searchtype=author&query=Tsai%2C+H">Hsiang-Sheng Tsai</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+W">Wen-Chin Huang</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+T">Tzu-hsun Feng</a>, <a href="/search/cs?searchtype=author&query=Chi%2C+P">Po-Han Chi</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+Y+Y">Yist Y. Lin</a>, <a href="/search/cs?searchtype=author&query=Chuang%2C+Y">Yung-Sung Chuang</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+T">Tzu-Hsien Huang</a>, <a href="/search/cs?searchtype=author&query=Tseng%2C+W">Wei-Cheng Tseng</a>, <a href="/search/cs?searchtype=author&query=Lakhotia%2C+K">Kushal Lakhotia</a>, <a href="/search/cs?searchtype=author&query=Li%2C+S">Shang-Wen Li</a>, <a href="/search/cs?searchtype=author&query=Mohamed%2C+A">Abdelrahman Mohamed</a>, <a href="/search/cs?searchtype=author&query=Watanabe%2C+S">Shinji Watanabe</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+H">Hung-yi Lee</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.09385v2-abstract-short" style="display: inline;"> The foundation model paradigm leverages a shared foundation model to achieve state-of-the-art (SOTA) performance for various tasks, requiring minimal downstream-specific modeling and data annotation. This approach has proven crucial in the field of Natural Language Processing (NLP). However, the speech processing community lacks a similar setup to explore the paradigm systematically. 
In this work,… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.09385v2-abstract-full').style.display = 'inline'; document.getElementById('2404.09385v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.09385v2-abstract-full" style="display: none;"> The foundation model paradigm leverages a shared foundation model to achieve state-of-the-art (SOTA) performance for various tasks, requiring minimal downstream-specific modeling and data annotation. This approach has proven crucial in the field of Natural Language Processing (NLP). However, the speech processing community lacks a similar setup to explore the paradigm systematically. In this work, we establish the Speech processing Universal PERformance Benchmark (SUPERB) to study the effectiveness of the paradigm for speech. We propose a unified multi-tasking framework to address speech processing tasks in SUPERB using a frozen foundation model followed by task-specialized, lightweight prediction heads. Combining our results with community submissions, we verify that the foundation model paradigm is promising for speech, and our multi-tasking framework is simple yet effective, as the best-performing foundation model shows competitive generalizability across most SUPERB tasks. For reproducibility and extensibility, we have developed a long-term maintained platform that enables deterministic benchmarking, allows for result sharing via an online leaderboard, and promotes collaboration through a community-driven benchmark database to support new development cycles. Finally, we conduct a series of analyses to offer an in-depth understanding of SUPERB and speech foundation models, including information flows across tasks inside the models, the correctness of the weighted-sum benchmarking protocol and the statistical significance and robustness of the benchmark. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.09385v2-abstract-full').style.display = 'none'; document.getElementById('2404.09385v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 14 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">The extended journal version for SUPERB and SUPERB-SG. Published in IEEE/ACM TASLP. 
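The frozen-upstream-plus-lightweight-head setup, including the weighted-sum pooling of layer features mentioned in the analyses, can be sketched as follows (schematic PyTorch; the upstream feature API and shapes are assumptions, with a single utterance shown for brevity):

```python
import torch
import torch.nn as nn

# Sketch of the SUPERB-style probing setup: a frozen upstream model exposes
# per-layer features; a learnable weighted sum pools them and a lightweight
# head is trained on top.
class WeightedSumHead(nn.Module):
    def __init__(self, num_layers: int, feat_dim: int, num_classes: int):
        super().__init__()
        self.layer_weights = nn.Parameter(torch.zeros(num_layers))
        self.head = nn.Linear(feat_dim, num_classes)

    def forward(self, layer_feats: torch.Tensor) -> torch.Tensor:
        # layer_feats: (num_layers, T, feat_dim) from the frozen upstream
        w = torch.softmax(self.layer_weights, dim=0)
        pooled = (w[:, None, None] * layer_feats).sum(dim=0)  # (T, feat_dim)
        return self.head(pooled.mean(dim=0))                  # class logits
```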
arXiv:2404.05272 [cs.AI] https://arxiv.org/abs/2404.05272
Constructing Data Transaction Chains Based on Opportunity Cost Exploration
Authors: Jie Liu, Tao Feng, Yan Jiang, Peizheng Wang, Chao Wu
Abstract: Data trading is increasingly gaining attention. However, the inherent replicability and privacy concerns of data make it challenging to directly apply traditional trading theories to data markets. This paper compares data trading markets with traditional ones, focusing particularly on how the replicability and privacy of data impact data markets. We discuss how data's replicability fundamentally alters the concept of opportunity cost in traditional microeconomics within the context of data markets. Additionally, we explore how to leverage this change to maximize benefits without compromising data privacy. This paper outlines the constraints for data circulation within the privacy domain chain and presents a model that maximizes data's value under these constraints. Specific application scenarios are provided, and experiments demonstrate the solvability of this model.
Submitted 8 April, 2024; originally announced April 2024.
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.00986">arXiv:2404.00986</a> <span> [<a href="https://arxiv.org/pdf/2404.00986">pdf</a>, <a href="https://arxiv.org/format/2404.00986">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Make Continual Learning Stronger via C-Flat </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Bian%2C+A">Ang Bian</a>, <a href="/search/cs?searchtype=author&query=Li%2C+W">Wei Li</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+H">Hangjie Yuan</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+C">Chengrong Yu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+M">Mang Wang</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+Z">Zixiang Zhao</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+A">Aojun Lu</a>, <a href="/search/cs?searchtype=author&query=Ji%2C+P">Pengliang Ji</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+T">Tao Feng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.00986v2-abstract-short" style="display: inline;"> Model generalization ability upon incrementally acquiring dynamically updating knowledge from sequentially arriving tasks is crucial to tackle the sensitivity-stability dilemma in Continual Learning (CL). Weight loss landscape sharpness minimization seeking for flat minima lying in neighborhoods with uniform low loss or smooth gradient is proven to be a strong training regime improving model gener… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.00986v2-abstract-full').style.display = 'inline'; document.getElementById('2404.00986v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.00986v2-abstract-full" style="display: none;"> Model generalization ability upon incrementally acquiring dynamically updating knowledge from sequentially arriving tasks is crucial to tackle the sensitivity-stability dilemma in Continual Learning (CL). Weight loss landscape sharpness minimization seeking for flat minima lying in neighborhoods with uniform low loss or smooth gradient is proven to be a strong training regime improving model generalization compared with loss minimization based optimizer like SGD. Yet only a few works have discussed this training regime for CL, proving that dedicated designed zeroth-order sharpness optimizer can improve CL performance. In this work, we propose a Continual Flatness (C-Flat) method featuring a flatter loss landscape tailored for CL. C-Flat could be easily called with only one line of code and is plug-and-play to any CL methods. A general framework of C-Flat applied to all CL categories and a thorough comparison with loss minima optimizer and flat minima based CL approaches is presented in this paper, showing that our method can boost CL performance in almost all cases. Code is available at https://github.com/WanNaa/C-Flat. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.00986v2-abstract-full').style.display = 'none'; document.getElementById('2404.00986v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 1 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.00471">arXiv:2404.00471</a> <span> [<a href="https://arxiv.org/pdf/2404.00471">pdf</a>, <a href="https://arxiv.org/format/2404.00471">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Medical Physics">physics.med-ph</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1109/ICASSP48485.2024.10447579">10.1109/ICASSP48485.2024.10447579 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Score-Based Diffusion Models for Photoacoustic Tomography Image Reconstruction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Dey%2C+S">Sreemanti Dey</a>, <a href="/search/cs?searchtype=author&query=Saha%2C+S">Snigdha Saha</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+B+T">Berthy T. Feng</a>, <a href="/search/cs?searchtype=author&query=Cui%2C+M">Manxiu Cui</a>, <a href="/search/cs?searchtype=author&query=Delisle%2C+L">Laure Delisle</a>, <a href="/search/cs?searchtype=author&query=Leong%2C+O">Oscar Leong</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+L+V">Lihong V. Wang</a>, <a href="/search/cs?searchtype=author&query=Bouman%2C+K+L">Katherine L. Bouman</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.00471v1-abstract-short" style="display: inline;"> Photoacoustic tomography (PAT) is a rapidly-evolving medical imaging modality that combines optical absorption contrast with ultrasound imaging depth. One challenge in PAT is image reconstruction with inadequate acoustic signals due to limited sensor coverage or due to the density of the transducer array. Such cases call for solving an ill-posed inverse reconstruction problem. 
arXiv:2403.15173 [cs.CV] https://arxiv.org/abs/2403.15173
LSK3DNet: Towards Effective and Efficient 3D Perception with Large Sparse Kernels
Authors: Tuo Feng, Wenguan Wang, Fan Ma, Yi Yang
Abstract: Autonomous systems need to process large-scale, sparse, and irregular point clouds with limited compute resources. Consequently, it is essential to develop LiDAR perception methods that are both efficient and effective. Although naively enlarging the 3D kernel size can enhance performance, it also leads to a cubically increasing overhead. Therefore, it is crucial to develop streamlined 3D large-kernel designs that eliminate redundant weights and work effectively with larger kernels. In this paper, we propose an efficient and effective Large Sparse Kernel 3D Neural Network (LSK3DNet) that leverages dynamic pruning to amplify the 3D kernel size. Our method comprises two core components: Spatial-wise Dynamic Sparsity (SDS) and Channel-wise Weight Selection (CWS). SDS dynamically prunes and regrows volumetric weights from the beginning to learn a large sparse 3D kernel. It not only boosts performance but also significantly reduces model size and computational cost. Moreover, CWS selects the most important channels for 3D convolution during training and subsequently prunes the redundant channels to accelerate inference for 3D vision tasks. We demonstrate the effectiveness of LSK3DNet on three benchmark datasets and five tracks compared with classical models and large kernel designs. Notably, LSK3DNet achieves state-of-the-art performance on SemanticKITTI (i.e., 75.6% on single-scan and 63.4% on multi-scan), with roughly 40% model size reduction and 60% reduction in computing operations compared to the naive large 3D kernel model.
Submitted 22 March, 2024; originally announced March 2024.
Comments: Accepted at CVPR 2024; Project page: https://github.com/FengZicai/LSK3DNet
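The prune-and-regrow step at the heart of SDS-style dynamic sparsity can be sketched as follows; magnitude-based dropping and random regrowth are common choices in the dynamic sparse training literature and are assumptions here, not the paper's exact criteria:

```python
import torch

# Sketch of one prune-and-regrow update on a kernel's sparsity mask: drop the
# weakest active weights by magnitude, regrow as many inactive positions.
def prune_and_regrow(weight: torch.Tensor, mask: torch.Tensor,
                     drop_frac: float = 0.1) -> torch.Tensor:
    flat_w, flat_m = weight.reshape(-1), mask.reshape(-1).clone()
    active = flat_m.bool()
    k = int(drop_frac * active.sum().item())
    if k == 0:
        return mask
    scores = flat_w.abs().masked_fill(~active, float("inf"))
    drop = torch.topk(scores, k, largest=False).indices
    flat_m[drop] = 0                              # prune weak active weights
    inactive = (~flat_m.bool()).nonzero().squeeze(1)
    grow = inactive[torch.randperm(len(inactive))[:k]]
    flat_m[grow] = 1                              # regrow elsewhere
    return flat_m.reshape(mask.shape)
```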
arXiv:2403.06302 [stat.ML, cs.LG] https://arxiv.org/abs/2403.06302
Nonparametric Automatic Differentiation Variational Inference with Spline Approximation
Authors: Yuda Shao, Shan Yu, Tianshu Feng
Abstract: Automatic Differentiation Variational Inference (ADVI) is efficient in learning probabilistic models. Classic ADVI relies on the parametric approach to approximate the posterior. In this paper, we develop a spline-based nonparametric approximation approach that enables flexible posterior approximation for distributions with complicated structures, such as skewness, multimodality, and bounded support. Compared with widely used nonparametric variational inference methods, the proposed method is easy to implement and adaptive to various data structures. By adopting the spline approximation, we derive a lower bound of the importance weighted autoencoder and establish the asymptotic consistency. Experiments demonstrate the efficiency of the proposed method in approximating complex posterior distributions and improving the performance of generative models with incomplete data.
Submitted 10 March, 2024; originally announced March 2024.