
Search | arXiv e-print repository

Showing 1–50 of 284 results for author: Meng, H

Searching in archive cs; sorted by announcement date (newest first), 50 results per page.

1. arXiv:2502.06075 [pdf, other]
   Subjects: cs.HC (Human-Computer Interaction); cs.CL (Computation and Language); cs.CY (Computers and Society)
   Title: Deconstructing Depression Stigma: Integrating AI-driven Data Collection and Analysis with Causal Knowledge Graphs
   Authors: Han Meng, Renwen Zhang, Ganyi Wang, Yitian Yang, Peinuan Qin, Jungup Lee, Yi-Chieh Lee
   Abstract: Mental-illness stigma is a persistent social problem, hampering both treatment-seeking and recovery. Accordingly, there is a pressing need to understand it more clearly, but analyzing the relevant data is highly labor-intensive. Therefore, we designed a chatbot to engage participants in conversations; coded those conversations qualitatively with AI assistance; and, based on those coding results, built causal knowledge graphs to decode stigma. The results we obtained from 1,002 participants demonstrate that conversation with our chatbot can elicit rich information about people's attitudes toward depression, while our AI-assisted coding was strongly consistent with human-expert coding. Our novel approach combining large language models (LLMs) and causal knowledge graphs uncovered patterns in individual responses and illustrated the interrelationships of psychological constructs in the dataset as a whole. The paper also discusses these findings' implications for HCI researchers in developing digital interventions, decomposing human psychological constructs, and fostering inclusive attitudes.
   Submitted 9 February, 2025; originally announced February 2025.
   Comments: Conditionally accepted to CHI Conference on Human Factors in Computing Systems (CHI'25)
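
Entry 1 distills coded conversations into a causal knowledge graph. As a purely illustrative sketch of that final step: given (cause, effect) pairs produced by qualitative coding, the graph is a weighted digraph. The coded constructs below are invented for illustration, not taken from the paper.

```python
# Sketch: assembling a causal knowledge graph from qualitatively coded
# (cause, effect) pairs, in the spirit of arXiv:2502.06075.
# The coded pairs below are hypothetical.
import networkx as nx

coded_pairs = [
    ("perceived dangerousness", "social distancing"),
    ("attribution to personal weakness", "blame"),
    ("contact with affected peers", "empathy"),
    ("empathy", "support for treatment-seeking"),
]

G = nx.DiGraph()
for cause, effect in coded_pairs:
    # Edge weight counts how many participants expressed this causal link.
    if G.has_edge(cause, effect):
        G[cause][effect]["weight"] += 1
    else:
        G.add_edge(cause, effect, weight=1)

# Constructs that shape many others are natural intervention targets.
ranked = sorted(G.out_degree(weight="weight"), key=lambda kv: -kv[1])
print(ranked[:3])
```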

2. arXiv:2502.05562 [pdf, other]
   Subjects: cs.DB (Databases)
   Title: Can Large Language Models Be Query Optimizer for Relational Databases?
   Authors: Jie Tan, Kangfei Zhao, Rui Li, Jeffrey Xu Yu, Chengzhi Piao, Hong Cheng, Helen Meng, Deli Zhao, Yu Rong
   Abstract: Query optimization, which finds the optimized execution plan for a given query, is a complex planning and decision-making problem within the exponentially growing plan space in database management systems (DBMS). Traditional optimizers rely heavily on a cost model constructed from various heuristics and empirical tuning, which can lead to suboptimal plans. Recent developments in Large Language Models (LLMs) have demonstrated their potential in solving complex planning and decision-making problems, such as arithmetic and programmatic tasks. In this paper, we explore the potential of LLMs in handling query optimization and propose a tentative LLM-based query optimizer dubbed LLM-QO, established on PostgreSQL's execution engine. In LLM-QO, we formulate query optimization in an autoregressive fashion which directly generates the execution plan without explicit plan enumeration. To investigate the essential input of LLM-QO, we design a customized data recipe named QInstruct to collect training data from various optimizers and serialize the database's metadata, queries, and corresponding plans into a textual format. Based on QInstruct, we implement a two-stage fine-tuning pipeline, Query Instruction Tuning (QIT) and Query Direct Preference Optimization (QDPO), to empower the capability of general-purpose LLMs in handling query optimization. In our experiments, LLM-QO generates valid and high-quality plans and consistently outperforms both traditional and learned optimizers on three query workloads. Our findings verify that LLMs can serve as query optimizers, though generalization, efficiency, and adaptivity deserve further research effort.
   Submitted 8 February, 2025; originally announced February 2025.
   Comments: 15 pages
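
Entry 2's QInstruct recipe serializes database metadata, queries, and plans into text for autoregressive plan generation. A minimal sketch of what such a serialization could look like; the field layout here is an assumption, not the paper's actual format.

```python
# Sketch: serializing a query plus catalog context into one text prompt
# for an autoregressive plan generator, loosely after the QInstruct idea
# in arXiv:2502.05562. The section headers are invented.
def serialize_example(schema: dict, query: str, plan: str) -> str:
    tables = "\n".join(
        f"  {t}({', '.join(cols)})" for t, cols in schema.items()
    )
    return (
        "### Schema\n" + tables + "\n"
        "### Query\n" + query + "\n"
        "### Plan\n" + plan  # target text the model learns to generate
    )

print(serialize_example(
    {"users": ["id", "name"], "orders": ["id", "user_id", "total"]},
    "SELECT name FROM users JOIN orders ON users.id = orders.user_id;",
    "Hash Join (users.id = orders.user_id) -> Seq Scan users, Seq Scan orders",
))
```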

3. arXiv:2502.03885 [pdf, other]
   Subjects: cs.NI (Networking and Internet Architecture); cs.DC (Distributed, Parallel, and Cluster Computing); cs.LG (Machine Learning)
   Title: InfinitePOD: Building Datacenter-Scale High-Bandwidth Domain for LLM with Optical Circuit Switching Transceivers
   Authors: Chenchen Shou, Guyue Liu, Hao Nie, Huaiyu Meng, Yu Zhou, Yimin Jiang, Wenqing Lv, Yelong Xu, Yuanwei Lu, Zhang Chen, Yanbo Yu, Yichen Shen, Yibo Zhu, Daxin Jiang
   Abstract: Scaling Large Language Model (LLM) training relies on multi-dimensional parallelism, where High-Bandwidth Domains (HBDs) are critical for communication-intensive parallelism like Tensor Parallelism (TP) and Expert Parallelism (EP). However, existing HBD architectures face fundamental limitations in scalability, cost, and fault resiliency: switch-centric HBDs (e.g., NVL-72) incur prohibitive scaling costs, while GPU-centric HBDs (e.g., TPUv3/Dojo) suffer from severe fault propagation. Switch-GPU hybrid HBDs such as TPUv4 take a middle-ground approach by leveraging Optical Circuit Switches, but the fault explosion radius remains large at the cube level (e.g., 64 TPUs). We propose InfinitePOD, a novel transceiver-centric HBD architecture that unifies connectivity and dynamic switching at the transceiver level using Optical Circuit Switching (OCS). By embedding OCS within each transceiver, InfinitePOD achieves reconfigurable point-to-multipoint connectivity, allowing the topology to adapt into variable-size rings. This design provides: i) datacenter-wide scalability without cost explosion; ii) fault resilience by isolating failures to a single node; and iii) full bandwidth utilization for fault-free GPUs. Key innovations include a Silicon Photonic (SiPh) based low-cost OCS transceiver (OCSTrx), a reconfigurable k-hop ring topology co-designed with intra-/inter-node communication, and an HBD-DCN orchestration algorithm maximizing GPU utilization while minimizing cross-ToR datacenter network traffic. The evaluation demonstrates that InfinitePOD achieves 31% of the cost of NVL-72, a near-zero GPU waste ratio (over one order of magnitude lower than NVL-72 and TPUv4), near-zero cross-ToR traffic when node fault ratios are under 7%, and improves Model FLOPs Utilization by 3.37x compared to NVIDIA DGX (8 GPUs per node).
   Submitted 7 February, 2025; v1 submitted 6 February, 2025; originally announced February 2025.
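
Entry 3's central claim is that per-transceiver optical switching lets the topology re-form as variable-size rings that simply route around failed nodes. A toy sketch of that re-forming step, with invented node ids and ring size; the real system reconfigures optical links, not Python lists.

```python
# Sketch: forming variable-size rings over only the healthy nodes, to
# mimic the fault-isolation property described for InfinitePOD
# (arXiv:2502.03885): a failed node drops out without taking a pod down.
def build_rings(nodes, failed, ring_size=8):
    healthy = [n for n in nodes if n not in failed]
    rings = []
    for i in range(0, len(healthy), ring_size):
        members = healthy[i:i + ring_size]
        # Each ring is a list of (src, dst) links closing a cycle.
        rings.append([(members[j], members[(j + 1) % len(members)])
                      for j in range(len(members))])
    return rings

for ring in build_rings(nodes=list(range(20)), failed={3, 11}, ring_size=8):
    print(ring)
```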

4. arXiv:2501.12536 [pdf, other]
   Subjects: cs.RO (Robotics); cs.AI (Artificial Intelligence)
   Title: Interaction Dataset of Autonomous Vehicles with Traffic Lights and Signs
   Authors: Zheng Li, Zhipeng Bao, Haoming Meng, Haotian Shi, Qianwen Li, Handong Yao, Xiaopeng Li
   Abstract: This paper presents the development of a comprehensive dataset capturing interactions between Autonomous Vehicles (AVs) and traffic control devices, specifically traffic lights and stop signs. Derived from the Waymo Motion dataset, our work addresses a critical gap in the existing literature by providing real-world trajectory data on how AVs navigate these traffic control devices. We propose a methodology for identifying and extracting relevant interaction trajectory data from the Waymo Motion dataset, incorporating over 37,000 instances with traffic lights and 44,000 with stop signs. Our methodology includes defining rules to identify various interaction types, extracting trajectory data, and applying a wavelet-based denoising method to smooth the acceleration and speed profiles and eliminate anomalous values, thereby enhancing the trajectory quality. Quality assessment metrics indicate that trajectories obtained in this study have anomaly proportions in acceleration and jerk profiles reduced to near-zero levels across all interaction categories. By making this dataset publicly available, we aim to address the current gap in datasets containing AV interaction behaviors with traffic lights and signs. Based on the organized and published dataset, we can gain a more in-depth understanding of AVs' behavior when interacting with traffic lights and signs. This will facilitate research on AV integration into existing transportation infrastructures and networks, supporting the development of more accurate behavioral models and simulation tools.
   Submitted 21 January, 2025; originally announced January 2025.
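
Entry 4 applies wavelet-based denoising to speed and acceleration profiles. A generic sketch of that technique with PyWavelets; the wavelet family, decomposition level, and universal-threshold rule below are common defaults, not the paper's settings.

```python
# Sketch: wavelet-based smoothing of a noisy speed profile, the generic
# technique named in arXiv:2501.12536.
import numpy as np
import pywt

def wavelet_denoise(signal, wavelet="db4", level=4):
    coeffs = pywt.wavedec(signal, wavelet, level=level)
    # Universal threshold; noise scale estimated from the finest band.
    sigma = np.median(np.abs(coeffs[-1])) / 0.6745
    thresh = sigma * np.sqrt(2 * np.log(len(signal)))
    coeffs[1:] = [pywt.threshold(c, thresh, mode="soft") for c in coeffs[1:]]
    return pywt.waverec(coeffs, wavelet)[: len(signal)]

t = np.linspace(0, 10, 512)
speed = 12 + 2 * np.sin(0.8 * t) + np.random.normal(0, 0.5, t.size)
smooth = wavelet_denoise(speed)
```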

5. arXiv:2501.12246 [pdf, other]
   Subjects: cs.CV (Computer Vision and Pattern Recognition)
   Title: Video Deblurring by Sharpness Prior Detection and Edge Information
   Authors: Yang Tian, Fabio Brau, Giulio Rossolini, Giorgio Buttazzo, Hao Meng
   Abstract: Video deblurring is an essential task for autonomous driving, facial recognition, and security surveillance. Traditional methods directly estimate motion blur kernels, often introducing artifacts and leading to poor results. Recent approaches utilize the detection of sharp frames within video sequences to enhance deblurring. However, existing datasets rely on a fixed number of sharp frames, which may be too restrictive for some applications and may introduce a bias during model training. To address these limitations and enhance domain adaptability, this work first introduces GoPro Random Sharp (GoProRS), a new dataset where the frequency of sharp frames within the sequence is customizable, allowing more diverse training and testing scenarios. Furthermore, it presents a novel video deblurring model, called SPEINet, that integrates sharp frame features into blurry frame reconstruction through an attention-based encoder-decoder architecture, a lightweight yet robust sharp frame detection, and an edge extraction phase. Extensive experimental results demonstrate that SPEINet outperforms state-of-the-art methods across multiple datasets, achieving an average of +3.2% PSNR improvement over recent techniques. Given such promising results, we believe that both the proposed model and dataset pave the way for future advancements in video deblurring based on the detection of sharp frames.
   Submitted 21 January, 2025; originally announced January 2025.
   Comments: Under review in Pattern Recognition
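
Entry 5 relies on detecting sharp frames inside a blurry sequence. A minimal stand-in detector using the classical variance-of-Laplacian sharpness score; SPEINet's actual detector is learned, and the threshold here is arbitrary.

```python
# Sketch: flagging sharp frames by variance of the Laplacian, a classical
# heuristic standing in for the learned detector in arXiv:2501.12246.
import cv2

def sharpness(gray_frame) -> float:
    return cv2.Laplacian(gray_frame, cv2.CV_64F).var()

def find_sharp_frames(video_path, threshold=100.0):
    cap = cv2.VideoCapture(video_path)
    sharp_idx, i = [], 0
    while True:
        ok, frame = cap.read()
        if not ok:
            break
        gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
        if sharpness(gray) > threshold:
            sharp_idx.append(i)
        i += 1
    cap.release()
    return sharp_idx
```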

6. arXiv:2501.10625 [pdf, other]
   Subjects: cs.LG (Machine Learning); eess.SY (Systems and Control); stat.ME (Methodology)
   Title: Assessing Markov Property in Driving Behaviors: Insights from Statistical Tests
   Authors: Zheng Li, Haoming Meng, Chengyuan Ma, Ke Ma, Xiaopeng Li
   Abstract: The Markov property serves as a foundational assumption in most existing work on vehicle driving behavior, positing that future states depend solely on the current state, not the series of preceding states. This study validates the Markov properties of vehicle trajectories for both Autonomous Vehicles (AVs) and Human-driven Vehicles (HVs). A statistical method for testing whether time series data exhibits Markov properties is applied to the trajectory data, and t-tests and F-tests are additionally introduced to characterize the differences in Markov properties between AVs and HVs. Based on two public trajectory datasets, we investigate the presence and order of the Markov property of different types of vehicles through rigorous statistical tests. Our findings reveal that AV trajectories generally exhibit stronger Markov properties compared to HV trajectories, with a higher percentage conforming to the Markov property and lower Markov orders. In contrast, HV trajectories display greater variability and heterogeneity in decision-making processes, reflecting the complex perception and information processing involved in human driving. These results have significant implications for the development of driving behavior models, AV controllers, and traffic simulation systems. Our study also demonstrates the feasibility of using statistical methods to test the presence of Markov properties in driving trajectory data.
   Submitted 17 January, 2025; originally announced January 2025.
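
Entry 6 tests whether a trajectory is first-order Markov. One classical recipe is the Anderson-Goodman likelihood-ratio test of first- versus second-order structure on a discretized signal; the sketch below implements that recipe, and the paper's exact procedure may differ.

```python
# Sketch: likelihood-ratio test of first- vs second-order Markov
# structure (Anderson & Goodman, 1957) on discretized driving states,
# the kind of test named generically in arXiv:2501.10625.
import numpy as np
from scipy.stats import chi2

def markov_order_test(seq, m):
    """seq: 1-D array of integer states in {0..m-1}."""
    n = np.zeros((m, m, m))
    for i, j, k in zip(seq[:-2], seq[1:-1], seq[2:]):
        n[i, j, k] += 1
    n_ij = n.sum(axis=2, keepdims=True)        # counts of (i, j, .)
    n_jk = n.sum(axis=0)                       # counts of (., j, k)
    n_j = n_jk.sum(axis=1, keepdims=True)      # counts of (., j, .)
    with np.errstate(divide="ignore", invalid="ignore"):
        p2 = n / n_ij                          # P(k | i, j)
        p1 = n_jk / n_j                        # P(k | j)
        ratio = np.where(n > 0, p2 / p1[np.newaxis, :, :], 1.0)
        g = 2.0 * np.nansum(n * np.log(ratio))
    df = m * (m - 1) ** 2
    return g, chi2.sf(g, df)  # small p-value favors second-order memory

states = np.random.randint(0, 3, size=5000)    # i.i.d. toy state sequence
g, p = markov_order_test(states, m=3)
print(f"G = {g:.2f}, p = {p:.3f}")             # should accept first-order
```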

7. arXiv:2501.08165 [pdf, other]
   Subjects: cs.SE (Software Engineering); cs.AI (Artificial Intelligence)
   Title: I Can Find You in Seconds! Leveraging Large Language Models for Code Authorship Attribution
   Authors: Soohyeon Choi, Yong Kiam Tan, Mark Huasong Meng, Mohamed Ragab, Soumik Mondal, David Mohaisen, Khin Mi Mi Aung
   Abstract: Source code authorship attribution is important in software forensics, plagiarism detection, and protecting software patch integrity. Existing techniques often rely on supervised machine learning, which struggles to generalize across different programming languages and coding styles because it requires large labeled datasets. Inspired by recent advances in natural language authorship analysis using large language models (LLMs), which have shown exceptional performance without task-specific tuning, this paper explores the use of LLMs for source code authorship attribution. We present a comprehensive study demonstrating that state-of-the-art LLMs can successfully attribute source code authorship across different languages. LLMs can determine whether two code snippets are written by the same author with zero-shot prompting, achieving a Matthews Correlation Coefficient (MCC) of 0.78, and can attribute code authorship from a small set of reference code snippets via few-shot learning, achieving an MCC of 0.77. Additionally, LLMs show some adversarial robustness against misattribution attacks. Despite these capabilities, we found that naive prompting of LLMs does not scale well with a large number of authors due to input token limitations. To address this, we propose a tournament-style approach for large-scale attribution. Evaluating this approach on datasets of C++ (500 authors, 26,355 samples) and Java (686 authors, 55,267 samples) code from GitHub, we achieve classification accuracy of up to 65% for C++ and 68.7% for Java using only one reference per author. These results open new possibilities for applying LLMs to code authorship attribution in cybersecurity and software engineering.
   Submitted 14 January, 2025; originally announced January 2025.
   Comments: 12 pages, 5 figures
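
Entry 7's tournament-style attribution keeps every prompt small by letting candidate authors advance through elimination rounds. A sketch of that control flow, assuming a hypothetical ask_llm comparison call and an invented group size; the paper's prompt design is not reproduced here.

```python
# Sketch: tournament attribution over a large author pool, in the spirit
# of arXiv:2501.08165. `ask_llm` is a stand-in for a chat-completion call.
def ask_llm(query_code: str, references: dict) -> str:
    """Return the author in `references` whose snippet best matches the
    query's style. Stub: wire this to an actual LLM API."""
    raise NotImplementedError

def tournament_attribute(query_code, references, group_size=8):
    candidates = list(references)
    while len(candidates) > 1:
        winners = []
        # Each round, the LLM only ever sees `group_size` authors at
        # once, keeping every prompt within the context window.
        for i in range(0, len(candidates), group_size):
            group = candidates[i:i + group_size]
            if len(group) == 1:
                winners.append(group[0])
            else:
                winners.append(ask_llm(query_code,
                                       {a: references[a] for a in group}))
        candidates = winners
    return candidates[0]
```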

8. arXiv:2501.07166 [pdf, other]; DOI: 10.1145/3627673.3679529
   Subjects: cs.AI (Artificial Intelligence)
   Title: Natural Language-Assisted Multi-modal Medication Recommendation
   Authors: Jie Tan, Yu Rong, Kangfei Zhao, Tian Bian, Tingyang Xu, Junzhou Huang, Hong Cheng, Helen Meng
   Abstract: Combinatorial medication recommendation (CMR) is a fundamental task in healthcare that offers opportunities for clinical physicians to provide more precise prescriptions for patients with intricate health conditions, particularly in scenarios of long-term medical care. Previous research efforts have sought to extract meaningful information from electronic health records (EHRs) to facilitate combinatorial medication recommendation. Existing learning-based approaches further consider the chemical structures of medications, but ignore the textual medication descriptions in which the functionalities are clearly described. Furthermore, the textual knowledge derived from the EHRs of patients remains largely underutilized. To address these issues, we introduce the Natural Language-Assisted Multi-modal Medication Recommendation (NLA-MMR), a multi-modal alignment framework designed to learn knowledge from the patient view and medication view jointly. Specifically, NLA-MMR formulates CMR as an alignment problem between the patient and medication modalities. In this vein, we employ pretrained language models (PLMs) to extract in-domain knowledge regarding patients and medications, serving as the foundational representation for both modalities. In the medication modality, we exploit both chemical structures and textual descriptions to create medication representations. In the patient modality, we generate the patient representations based on textual descriptions of diagnosis, procedure, and symptom. Extensive experiments conducted on three publicly accessible datasets demonstrate that NLA-MMR achieves new state-of-the-art performance, with a notable average improvement of 4.72% in Jaccard score. Our source code is publicly available at https://github.com/jtan1102/NLA-MMR_CIKM_2024.
   Submitted 13 January, 2025; originally announced January 2025.
   Comments: 10 pages
   Journal ref: Proceedings of the 33rd ACM International Conference on Information and Knowledge Management, Boise, ID, USA, 2024
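
Entry 8 frames recommendation as alignment between a patient modality and a medication modality. A two-tower scoring sketch that captures that framing; the encoders, dimensions, and top-k rule are placeholders rather than NLA-MMR's design.

```python
# Sketch: scoring medications by similarity to a patient embedding, the
# alignment view described in arXiv:2501.07166. Random vectors stand in
# for PLM-derived patient and medication representations.
import torch
import torch.nn.functional as F

def recommend(patient_emb, med_embs, med_names, k=3):
    # Cosine similarity between one patient vector and all medications.
    scores = F.cosine_similarity(patient_emb.unsqueeze(0), med_embs, dim=1)
    top = torch.topk(scores, k).indices.tolist()
    return [med_names[i] for i in top]

torch.manual_seed(0)
patient = torch.randn(128)   # would come from a PLM patient tower
meds = torch.randn(5, 128)   # would come from a PLM medication tower
print(recommend(patient, meds, ["A", "B", "C", "D", "E"]))
```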

9. arXiv:2501.03829 [pdf, other]
   Subjects: eess.AS (Audio and Speech Processing); cs.SD (Sound)
   Title: Spectral-Aware Low-Rank Adaptation for Speaker Verification
   Authors: Zhe Li, Man-wai Mak, Mert Pilanci, Hung-yi Lee, Helen Meng
   Abstract: Previous research has shown that the principal singular vectors of a pre-trained model's weight matrices capture critical knowledge. In contrast, those associated with small singular values may contain noise or less reliable information. As a result, the LoRA-based parameter-efficient fine-tuning (PEFT) approach, which does not constrain the use of the spectral space, may not be effective for tasks that demand high representation capacity. In this study, we enhance existing PEFT techniques by incorporating the spectral information of pre-trained weight matrices into the fine-tuning process. We investigate spectral adaptation strategies with a particular focus on the additive adjustment of top singular vectors. This is accomplished by applying singular value decomposition (SVD) to the pre-trained weight matrices and restricting the fine-tuning within the top spectral space. Extensive speaker verification experiments on VoxCeleb1 and CN-Celeb1 demonstrate enhanced tuning performance with the proposed approach. Code is released at https://github.com/lizhepolyu/SpectralFT.
   Submitted 7 February, 2025; v1 submitted 7 January, 2025; originally announced January 2025.
   Comments: Accepted by ICASSP 2025
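
Entry 9 restricts fine-tuning to the top spectral subspace of each frozen weight. A sketch of one way to parameterize that constraint; the rank and the r-by-r mixing form are illustrative, and the authors' released code is authoritative.

```python
# Sketch: an additive update confined to the top spectral subspace of a
# frozen weight, the general idea named in arXiv:2501.03829.
import torch
import torch.nn as nn

class TopSpectralAdapter(nn.Module):
    def __init__(self, weight: torch.Tensor, rank: int = 8):
        super().__init__()
        U, S, Vh = torch.linalg.svd(weight, full_matrices=False)
        self.register_buffer("W", weight)         # frozen base weight
        self.register_buffer("U", U[:, :rank])    # top-r left vectors
        self.register_buffer("Vh", Vh[:rank, :])  # top-r right vectors
        # Trainable r x r mixing matrix: the update U @ A @ Vh can only
        # move the weight within the top spectral subspace.
        self.A = nn.Parameter(torch.zeros(rank, rank))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        W_eff = self.W + self.U @ self.A @ self.Vh
        return x @ W_eff.T

layer = TopSpectralAdapter(torch.randn(256, 128), rank=8)
y = layer(torch.randn(4, 128))   # only `A` receives gradients
```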
While these features provide&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.03727v1-abstract-full').style.display = 'inline'; document.getElementById('2501.03727v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.03727v1-abstract-full" style="display: none;"> Early detection of neurocognitive disorders (NCDs) is crucial for timely intervention and disease management. Speech analysis offers a non-intrusive and scalable screening method, particularly through narrative tasks in neuropsychological assessment tools. Traditional narrative analysis often focuses on local indicators in microstructure, such as word usage and syntax. While these features provide insights into language production abilities, they often fail to capture global narrative patterns, or microstructures. Macrostructures include coherence, thematic organization, and logical progressions, reflecting essential cognitive skills potentially critical for recognizing NCDs. Addressing this gap, we propose to investigate specific cognitive and linguistic challenges by analyzing topical shifts, temporal dynamics, and the coherence of narratives over time, aiming to reveal cognitive deficits by identifying narrative impairments, and exploring their impact on communication and cognition. The investigation is based on the CU-MARVEL Rabbit Story corpus, which comprises recordings of a story-telling task from 758 older adults. We developed two approaches: the Dynamic Topic Models (DTM)-based temporal analysis to examine the evolution of topics over time, and the Text-Image Temporal Alignment Network (TITAN) to evaluate the coherence between spoken narratives and visual stimuli. DTM-based approach validated the effectiveness of dynamic topic consistency as a macrostructural metric (F1=0.61, AUC=0.78). The TITAN approach achieved the highest performance (F1=0.72, AUC=0.81), surpassing established microstructural and macrostructural feature sets. Cross-comparison and regression tasks further demonstrated the effectiveness of proposed dynamic macrostructural modeling approaches for NCD detection. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.03727v1-abstract-full').style.display = 'none'; document.getElementById('2501.03727v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. 
Comments: 12 pages, 8 figures

arXiv:2501.01103 [pdf, other] (eess.AS, cs.AI, cs.SD) https://arxiv.org/abs/2501.01103
Title: Learning discriminative features from spectrograms using center loss for speech emotion recognition
Authors: Dongyang Dai, Zhiyong Wu, Runnan Li, Xixin Wu, Jia Jia, Helen Meng
Abstract: Identifying the emotional state from speech is essential for the natural interaction of the machine with the speaker. However, extracting effective features for emotion recognition is difficult, as emotions are ambiguous. We propose a novel approach to learn discriminative features from variable-length spectrograms for emotion recognition by combining softmax cross-entropy loss and center loss. The softmax cross-entropy loss makes features from different emotion categories separable, while the center loss efficiently pulls features belonging to the same emotion category towards their center. Combining the two losses greatly enhances the discriminative power, leading the network to learn more effective features for emotion recognition. As demonstrated by the experimental results, after introducing center loss, both unweighted and weighted accuracy improve by over 3% on Mel-spectrogram input, and by more than 4% on Short-Time Fourier Transform spectrogram input.
Submitted 2 January, 2025; originally announced January 2025.
Comments: Accepted at ICASSP 2019
Journal ref: Proc. IEEE Int. Conf. Acoust. Speech Signal Process. (ICASSP) 2019, pp. 7405-7409
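As a concrete illustration of the joint objective described in this abstract, a minimal PyTorch sketch follows; the feature dimension, class count, batch size, and the 0.1 loss weight are illustrative assumptions, not values from the paper.

    import torch
    import torch.nn as nn

    class CenterLoss(nn.Module):
        """Pulls each embedding toward the learned center of its emotion class."""
        def __init__(self, num_classes, feat_dim):
            super().__init__()
            self.centers = nn.Parameter(torch.randn(num_classes, feat_dim))

        def forward(self, feats, labels):
            # Mean squared distance between each feature and its class center.
            return ((feats - self.centers[labels]) ** 2).sum(dim=1).mean()

    num_classes, feat_dim = 4, 128                 # assumed sizes
    classifier = nn.Linear(feat_dim, num_classes)
    center_loss = CenterLoss(num_classes, feat_dim)
    ce_loss = nn.CrossEntropyLoss()

    feats = torch.randn(8, feat_dim)               # stand-in spectrogram embeddings
    labels = torch.randint(0, num_classes, (8,))
    # Cross-entropy separates classes; center loss compacts each class cluster.
    loss = ce_loss(classifier(feats), labels) + 0.1 * center_loss(feats, labels)
    loss.backward()

The weighting between the two terms controls the trade-off between inter-class separability and intra-class compactness.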
arXiv:2501.01102 [pdf, other] (eess.AS, cs.AI, cs.SD) https://arxiv.org/abs/2501.01102
Title: Disambiguation of Chinese Polyphones in an End-to-End Framework with Semantic Features Extracted by Pre-trained BERT
Authors: Dongyang Dai, Zhiyong Wu, Shiyin Kang, Xixin Wu, Jia Jia, Dan Su, Dong Yu, Helen Meng
Abstract: Grapheme-to-phoneme (G2P) conversion serves as an essential component in Chinese Mandarin text-to-speech (TTS) systems, where polyphone disambiguation is the core issue.
In this paper, we propose an end-to-end framework to predict the pronunciation of a polyphonic character, which accepts a sentence containing the polyphonic character as input in the form of a Chinese character sequence, without the need for any preprocessing. The proposed method consists of a pre-trained Bidirectional Encoder Representations from Transformers (BERT) model and a neural network (NN) based classifier. The pre-trained BERT model extracts semantic features from a raw Chinese character sequence, and the NN-based classifier predicts the polyphonic character's pronunciation according to the BERT output. In our experiments, we implemented three classifiers: a fully-connected network based classifier, a long short-term memory (LSTM) network based classifier, and a Transformer block based classifier. The experimental results, compared with the baseline approach based on LSTM, demonstrate that the pre-trained model extracts effective semantic features, which greatly enhances the performance of polyphone disambiguation. In addition, we also explored the impact of contextual information on polyphone disambiguation.
Submitted 2 January, 2025; originally announced January 2025.
Comments: Accepted at INTERSPEECH 2019
Journal ref: Proc. Interspeech 2019, pp. 2090-2094
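The shape of such a framework can be sketched with the Hugging Face transformers library; the model name, the two-way pronunciation head, and the simple character-to-token alignment are assumptions for illustration, not the paper's exact setup.

    import torch
    import torch.nn as nn
    from transformers import BertModel, BertTokenizer

    tokenizer = BertTokenizer.from_pretrained("bert-base-chinese")
    bert = BertModel.from_pretrained("bert-base-chinese")
    head = nn.Linear(bert.config.hidden_size, 2)   # e.g. two candidate pronunciations

    sentence = "中国银行在前面"        # "行" is read "hang2" here, not "xing2"
    char_pos = sentence.index("行")
    inputs = tokenizer(sentence, return_tensors="pt")
    with torch.no_grad():
        hidden = bert(**inputs).last_hidden_state  # (1, seq_len, hidden_size)
    # bert-base-chinese tokenizes CJK text per character, so character position
    # maps to token position offset by the leading [CLS] token.
    feat = hidden[0, char_pos + 1]
    logits = head(feat)                            # scores over pronunciations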
arXiv:2501.00910 [pdf, other] (cs.LG, cs.AI) https://arxiv.org/abs/2501.00910
Title: Population Aware Diffusion for Time Series Generation
Authors: Yang Li, Han Meng, Zhenyu Bi, Ingolv T. Urnes, Haipeng Chen
Abstract: Diffusion models have shown promising ability in generating high-quality time series (TS) data. Despite the initial success, existing works mostly focus on the authenticity of data at the individual level, but pay less attention to preserving the population-level properties of the entire dataset. Such population-level properties include value distributions for each dimension and distributions of certain functional dependencies (e.g., cross-correlation, CC) between different dimensions. For instance, when generating house energy consumption TS data, the value distributions of the outside temperature and the kitchen temperature should be preserved, as well as the distribution of CC between them. Preserving such TS population-level properties is critical for maintaining the statistical insights of the datasets, mitigating model bias, and augmenting downstream tasks like TS prediction. Yet, it is often overlooked by existing models. Hence, data generated by existing models often bear distribution shifts from the original data. We propose Population-aware Diffusion for Time Series (PaD-TS), a new TS generation model that better preserves population-level properties. The key novelties of PaD-TS include 1) a new training method explicitly incorporating TS population-level property preservation, and 2) a new dual-channel encoder model architecture that better captures the TS data structure. Empirical results on major benchmark datasets show that PaD-TS can improve the average CC distribution shift score between real and synthetic data by 5.9x while maintaining performance comparable to state-of-the-art models on individual-level authenticity.
Submitted 1 January, 2025; originally announced January 2025.
Comments: Accepted for publication at AAAI-2025, 8 pages
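The population-level property at the heart of this abstract can be made concrete with a small sketch: compute the per-sample cross-correlation between two dimensions for a real and a synthetic dataset, then compare the two CC distributions. The Wasserstein comparison below is an assumed stand-in for the paper's shift score.

    import numpy as np
    from scipy.stats import wasserstein_distance

    def cc_per_sample(data, i, j):
        """Pearson CC between dimensions i and j for each series in (N, T, D) data."""
        return np.array([np.corrcoef(s[:, i], s[:, j])[0, 1] for s in data])

    real = np.random.randn(100, 48, 2)        # stand-in real dataset (N, T, D)
    synth = np.random.randn(100, 48, 2)       # stand-in generated dataset
    # A faithful generator should make this distance small.
    shift = wasserstein_distance(cc_per_sample(real, 0, 1),
                                 cc_per_sample(synth, 0, 1))
    print(f"CC distribution shift: {shift:.4f}")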
arXiv:2412.20942 [pdf, other] (cs.AI, cs.IR) https://arxiv.org/abs/2412.20942
Title: Ontology-grounded Automatic Knowledge Graph Construction by LLM under Wikidata schema
Authors: Xiaohan Feng, Xixin Wu, Helen Meng
Abstract: We propose an ontology-grounded approach to Knowledge Graph (KG) construction using Large Language Models (LLMs) on a knowledge base. An ontology is authored by generating Competency Questions (CQs) on the knowledge base to discover the knowledge scope, extracting relations from the CQs, and attempting to replace equivalent relations with their counterparts in Wikidata. To ensure consistency and interpretability in the resulting KG, we ground the generation of the KG with the authored ontology, based on the extracted relations. Evaluation on benchmark datasets demonstrates competitive performance in the knowledge graph construction task. Our work presents a promising direction for a scalable KG construction pipeline with minimal human intervention that yields high-quality, human-interpretable KGs, which are interoperable with Wikidata semantics for potential knowledge base expansion.
Submitted 30 December, 2024; originally announced December 2024.
Comments: Presented at HI-AI@KDD, Human-Interpretable AI Workshop at the KDD 2024, 26th of August 2024, Barcelona, Spain
Journal ref: CEUR Workshop Proceedings 3841 (2024) 117-135
arXiv:2412.19067 [pdf, other] (cs.CV, cs.LG, cs.RO) https://arxiv.org/abs/2412.19067
Title: Learning Monocular Depth from Events via Egomotion Compensation
Authors: Haitao Meng, Chonghao Zhong, Sheng Tang, Lian JunJia, Wenwei Lin, Zhenshan Bing, Yi Chang, Gang Chen, Alois Knoll
Abstract: Event cameras are neuromorphically inspired sensors that sparsely and asynchronously report brightness changes. Their unique characteristics of high temporal resolution, high dynamic range, and low power consumption make them well-suited for addressing challenges in monocular depth estimation (e.g., high-speed or low-lighting conditions). However, existing methods primarily treat event streams as black-box learning systems without incorporating prior physical principles, thus becoming over-parameterized and failing to fully exploit the rich temporal information inherent in event camera data.
To address this limitation, we incorporate physical motion principles to propose an interpretable monocular depth estimation framework, where the likelihood of various depth hypotheses is explicitly determined by the effect of motion compensation. To achieve this, we propose a Focus Cost Discrimination (FCD) module that measures the clarity of edges as an essential indicator of focus level and integrates spatial surroundings to facilitate cost estimation. Furthermore, we analyze the noise patterns within our framework and improve it with the newly introduced Inter-Hypotheses Cost Aggregation (IHCA) module, where the cost volume is refined through cost trend prediction and multi-scale cost consistency constraints. Extensive experiments on real-world and synthetic datasets demonstrate that our proposed framework outperforms cutting-edge methods by up to 10% on the absolute relative error metric, revealing superior prediction accuracy.
Submitted 26 December, 2024; originally announced December 2024.
Comments: 9 pages, 3 figures
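The focus-based idea can be illustrated with a much simpler stand-in for the FCD module: score each depth hypothesis by the sharpness of edges in its motion-compensated event frame. The Laplacian-variance clarity measure and the random frames below are assumptions, not the paper's implementation.

    import numpy as np
    from scipy.ndimage import laplace

    def clarity_score(iwe):
        """Higher when edges in an image of warped events (IWE) are sharper."""
        return float(laplace(iwe.astype(np.float64)).var())

    depth_hypotheses = np.linspace(1.0, 10.0, 32)                # candidate depths (m)
    frames = [np.random.rand(64, 64) for _ in depth_hypotheses]  # stand-in IWEs
    scores = [clarity_score(f) for f in frames]
    best = depth_hypotheses[int(np.argmax(scores))]
    print(f"most in-focus depth hypothesis: {best:.2f} m")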
arXiv:2412.18832 [pdf, other] (eess.AS, cs.SD) https://arxiv.org/abs/2412.18832
Title: Structured Speaker-Deficiency Adaptation of Foundation Models for Dysarthric and Elderly Speech Recognition
Authors: Shujie Hu, Xurong Xie, Mengzhe Geng, Jiajun Deng, Zengrui Jin, Tianzi Wang, Mingyu Cui, Guinan Li, Zhaoqing Li, Helen Meng, Xunying Liu
Abstract: Data-intensive fine-tuning of speech foundation models (SFMs) to scarce and diverse dysarthric and elderly speech leads to data bias and poor generalization to unseen speakers. This paper proposes novel structured speaker-deficiency adaptation approaches for SSL pre-trained SFMs on such data. Speaker and speech deficiency invariant SFMs were constructed in their supervised adaptive fine-tuning stage to reduce undue bias towards training data speakers, and serve as a more neutral and robust starting point for test-time unsupervised adaptation. Speech variability attributed to speaker identity and speech impairment severity, or aging-induced neurocognitive decline, is modelled using separate adapters that can be combined together to model any seen or unseen speaker. Experiments on the UASpeech dysarthric and DementiaBank Pitt elderly speech corpora suggest that structured speaker-deficiency adaptation of HuBERT and Wav2vec2-conformer models consistently outperforms baseline SFMs using either: a) no adapters; b) global adapters shared among all speakers; or c) single-attribute adapters modelling speaker or deficiency labels alone, by statistically significant WER reductions of up to 3.01% and 1.50% absolute (10.86% and 6.94% relative) on the two tasks respectively. The lowest published WER of 19.45% (49.34% on very low intelligibility, 33.17% on unseen words) is obtained on the UASpeech test set of 16 dysarthric speakers.
Submitted 25 December, 2024; originally announced December 2024.
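A minimal sketch of the structured-adapter idea follows: separate bottleneck adapters for speaker identity and speech deficiency whose residual outputs are combined on top of frozen SFM hidden states. The bottleneck size and the additive combination are illustrative assumptions.

    import torch
    import torch.nn as nn

    class Adapter(nn.Module):
        """Residual bottleneck adapter attached to a frozen transformer layer."""
        def __init__(self, dim, bottleneck=32):
            super().__init__()
            self.down = nn.Linear(dim, bottleneck)
            self.up = nn.Linear(bottleneck, dim)

        def forward(self, h):
            return self.up(torch.relu(self.down(h)))

    dim = 768                                # assumed SFM hidden size
    speaker_adapter = Adapter(dim)           # models speaker identity
    deficiency_adapter = Adapter(dim)        # models impairment severity / decline
    h = torch.randn(1, 50, dim)              # stand-in frozen SFM hidden states
    # Attribute adapters combine so any seen or unseen speaker can be composed.
    h_adapted = h + speaker_adapter(h) + deficiency_adapter(h)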
arXiv:2412.06332 [pdf, other] (cs.CL, cs.AI, cs.LG, q-bio.NC) https://arxiv.org/abs/2412.06332
Title: Not All Errors Are Equal: Investigation of Speech Recognition Errors in Alzheimer's Disease Detection
Authors: Jiawen Kang, Junan Li, Jinchao Li, Xixin Wu, Helen Meng
Abstract: Automatic Speech Recognition (ASR) plays an important role in speech-based automatic detection of Alzheimer's disease (AD). However, recognition errors could propagate downstream, potentially impacting the detection decisions. Recent studies have revealed a non-linear relationship between word error rates (WER) and AD detection performance, where ASR transcriptions with notable errors could still yield AD detection accuracy equivalent to that based on manual transcriptions. This work presents a series of analyses to explore the effect of ASR transcription errors in BERT-based AD detection systems. Our investigation reveals that not all ASR errors contribute equally to detection performance. Certain words, such as stopwords, despite constituting a large proportion of errors, are shown to play a limited role in distinguishing AD. In contrast, the keywords related to diagnosis tasks exhibit significantly greater importance relative to other words. These findings provide insights into the interplay between ASR errors and the downstream detection model.
Submitted 9 December, 2024; originally announced December 2024.
Comments: Accepted by IEEE ISCSLP 2024

arXiv:2411.18922 [pdf, other] (cs.CL, cs.AI) https://arxiv.org/abs/2411.18922
Title: Devising a Set of Compact and Explainable Spoken Language Feature for Screening Alzheimer's Disease
Authors: Junan Li, Yunxiang Li, Yuren Wang, Xixin Wu, Helen Meng
Abstract: Alzheimer's disease (AD) has become one of the most significant health challenges in an aging society. Spoken language-based AD detection methods have gained prevalence due to their scalability. Based on the Cookie Theft picture description task, we devised an explainable and effective feature set that leverages the visual capabilities of a large language model (LLM) and the Term Frequency-Inverse Document Frequency (TF-IDF) model. Our experimental results show that the newly proposed features consistently outperform traditional linguistic features across two different classifiers with high dimension efficiency.
Our new features can be well explained and interpreted step by step, which enhances the interpretability of automatic AD screening.
Submitted 28 November, 2024; originally announced November 2024.
Comments: Published at ISCSLP 2024
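The TF-IDF half of the feature set is straightforward to sketch with scikit-learn; the toy transcripts and labels below are placeholders, and the LLM-derived visual features are not reproduced here.

    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.linear_model import LogisticRegression

    # Toy Cookie Theft transcripts with assumed labels (0 = control, 1 = AD).
    transcripts = ["the boy is taking a cookie from the jar",
                   "water is running over the sink onto the floor"]
    labels = [0, 1]

    vectorizer = TfidfVectorizer()
    features = vectorizer.fit_transform(transcripts)   # sparse TF-IDF matrix
    clf = LogisticRegression().fit(features, labels)
    print(clf.predict(vectorizer.transform(["the sink is overflowing"])))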
arXiv:2411.18266 [pdf] (eess.AS, cs.AI, cs.SD, eess.SY) https://arxiv.org/abs/2411.18266
Title: Wearable intelligent throat enables natural speech in stroke patients with dysarthria
Authors: Chenyu Tang, Shuo Gao, Cong Li, Wentian Yi, Yuxuan Jin, Xiaoxue Zhai, Sixuan Lei, Hongbei Meng, Zibo Zhang, Muzi Xu, Shengbo Wang, Xuhang Chen, Chenxi Wang, Hongyun Yang, Ningli Wang, Wenyu Wang, Jin Cao, Xiaodong Feng, Peter Smielewski, Yu Pan, Wenhui Song, Martin Birchall, Luigi G. Occhipinti
Abstract: Wearable silent speech systems hold significant potential for restoring communication in patients with speech impairments. However, seamless, coherent speech remains elusive, and clinical efficacy is still unproven. Here, we present an AI-driven intelligent throat (IT) system that integrates throat muscle vibrations and carotid pulse signal sensors with large language model (LLM) processing to enable fluent, emotionally expressive communication. The system utilizes ultrasensitive textile strain sensors to capture high-quality signals from the neck area and supports token-level processing for real-time, continuous speech decoding, enabling seamless, delay-free communication. In tests with five stroke patients with dysarthria, IT's LLM agents intelligently corrected token errors and enriched sentence-level emotional and logical coherence, achieving low error rates (4.2% word error rate, 2.9% sentence error rate) and a 55% increase in user satisfaction. This work establishes a portable, intuitive communication platform for patients with dysarthria, with the potential to be applied broadly across different neurological conditions and in multi-language support systems.
Submitted 28 November, 2024; v1 submitted 27 November, 2024; originally announced November 2024.
Comments: 5 figures, 45 references

arXiv:2411.08742 [pdf, other] (cs.CL, cs.SD, eess.AS) https://arxiv.org/abs/2411.08742
Title: A Comparative Study of Discrete Speech Tokens for Semantic-Related Tasks with Large Language Models
Authors: Dingdong Wang, Mingyu Cui, Dongchao Yang, Xueyuan Chen, Helen Meng
Abstract: With the rise of Speech Large Language Models (Speech LLMs), there has been growing interest in discrete speech tokens for their ability to integrate seamlessly with text-based tokens. While most studies focus on continuous speech features, discrete-token-based LLMs have shown promising results on certain tasks; however, the performance gap between these two paradigms is rarely explored. In this paper, we present a fair and thorough comparison between discrete and continuous features across a variety of semantic-related tasks using a lightweight LLM (Qwen1.5-0.5B). Our findings reveal that continuous features generally outperform discrete tokens, particularly in tasks requiring fine-grained semantic understanding. Moreover, this study goes beyond surface-level comparison by identifying key factors behind the under-performance of discrete tokens, such as limited token granularity and inefficient information retention. To enhance the performance of discrete tokens, we explore potential aspects based on our analysis.
We hope our results can offer new insights into the opportunities for advancing discrete speech tokens in Speech LLMs.
Submitted 13 November, 2024; originally announced November 2024.
Comments: 5 tables, 4 figures

arXiv:2411.07563 [pdf, other] (cs.AI) https://arxiv.org/abs/2411.07563
Title: Improving Grapheme-to-Phoneme Conversion through In-Context Knowledge Retrieval with Large Language Models
Authors: Dongrui Han, Mingyu Cui, Jiawen Kang, Xixin Wu, Xunying Liu, Helen Meng
Abstract: Grapheme-to-phoneme (G2P) conversion is a crucial step in Text-to-Speech (TTS) systems, responsible for mapping graphemes to their corresponding phonetic representations. However, it faces ambiguity problems, where the same grapheme can represent multiple phonemes depending on context, posing a challenge for G2P conversion. Inspired by the remarkable success of Large Language Models (LLMs) in handling context-aware scenarios, we propose contextual G2P conversion systems that use LLMs' in-context knowledge retrieval (ICKR) capabilities to improve disambiguation.
The efficacy of incorporating ICKR into G2P conversion systems is demonstrated thoroughly on the Librig2p dataset. In particular, the best contextual G2P conversion system using ICKR outperforms the baseline with a weighted average phoneme error rate (PER) reduction of 2.0% absolute (28.9% relative). Using GPT-4 in the ICKR system brings a further gain of 3.5% absolute (3.8% relative) on the Librig2p dataset.
Submitted 12 November, 2024; originally announced November 2024.
Comments: accepted by ISCSLP 2024
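One way to picture ICKR is a prompt that carries retrieved pronunciation knowledge for the ambiguous grapheme; the toy lexicon, substring retrieval, and prompt format below are assumptions rather than the paper's pipeline.

    # Toy pronunciation lexicon (ARPAbet) keyed by disambiguating context.
    lexicon = {
        "read (past tense)": "R EH1 D",
        "read (present tense)": "R IY1 D",
    }

    def build_prompt(sentence, word):
        retrieved = "\n".join(f"- {k}: {v}" for k, v in lexicon.items() if word in k)
        return (f"Retrieved pronunciations:\n{retrieved}\n\n"
                f"Sentence: {sentence}\nPhonemes for '{word}':")

    # The assembled prompt would be sent to the LLM for in-context disambiguation.
    print(build_prompt("Yesterday she read the letter.", "read"))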
arXiv:2411.03172 [pdf, other] (eess.AS, cs.LG, cs.SD, eess.SP) https://arxiv.org/abs/2411.03172
Title: Blind Estimation of Sub-band Acoustic Parameters from Ambisonics Recordings using Spectro-Spatial Covariance Features
Authors: Hanyu Meng, Jeroen Breebaart, Jeremy Stoddard, Vidhyasaharan Sethu, Eliathamby Ambikairajah
Abstract: Estimating frequency-varying acoustic parameters is essential for enhancing immersive perception in realistic spatial audio creation. In this paper, we propose a unified framework that blindly estimates reverberation time (T60), direct-to-reverberant ratio (DRR), and clarity (C50) across 10 frequency bands using first-order Ambisonics (FOA) speech recordings as inputs. The proposed framework utilizes a novel feature named the Spectro-Spatial Covariance Vector (SSCV), which efficiently represents the temporal, spectral, and spatial information of the FOA signal. Our models significantly outperform existing single-channel methods that use only spectral information, reducing estimation errors by more than half for all three acoustic parameters. Additionally, we introduce FOA-Conv3D, a novel back-end network for effectively utilising the SSCV feature with a 3D convolutional encoder. FOA-Conv3D outperforms the convolutional neural network (CNN) and convolutional recurrent neural network (CRNN) back-ends, achieving lower estimation errors and accounting for a higher proportion of variance (PoV) for all three acoustic parameters.
Submitted 12 January, 2025; v1 submitted 5 November, 2024; originally announced November 2024.
Comments: Accepted by ICASSP 2025
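The core of a spectro-spatial covariance feature can be sketched as a per-band covariance across the four FOA channels (W, X, Y, Z); the exact SSCV construction, band layout, and sampling choices below are assumptions.

    import numpy as np
    from scipy.signal import stft

    foa = np.random.randn(4, 16000)                # stand-in FOA recording (W, X, Y, Z)
    _, _, spec = stft(foa, fs=16000, nperseg=512)  # (4, freq_bins, frames)

    def band_covariance(spec, lo, hi):
        """4x4 spatial covariance of the FOA channels over one frequency band."""
        band = spec[:, lo:hi, :].reshape(4, -1)    # stack band bins and frames
        return (band @ band.conj().T) / band.shape[1]

    cov = band_covariance(spec, 10, 20)            # one assumed sub-band
    print(cov.shape)                               # (4, 4) complex covariance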
arXiv:2410.20130 [pdf] (cs.HC, cs.CY) https://arxiv.org/abs/2410.20130
Title: The Dark Side of AI Companionship: A Taxonomy of Harmful Algorithmic Behaviors in Human-AI Relationships
Authors: Renwen Zhang, Han Li, Han Meng, Jinyuan Zhan, Hongyuan Gan, Yi-Chieh Lee
Abstract: As conversational AI systems increasingly permeate the socio-emotional realms of human life, they bring both benefits and risks to individuals and society. Despite extensive research on detecting and categorizing harms in AI systems, less is known about the harms that arise from social interactions with AI chatbots. Through a mixed-methods analysis of 35,390 conversation excerpts shared on r/replika, an online community for users of the AI companion Replika, we identified six categories of harmful behaviors exhibited by the chatbot: relational transgression, verbal abuse and hate, self-inflicted harm, harassment and violence, mis/disinformation, and privacy violations. The AI contributes to these harms through four distinct roles: perpetrator, instigator, facilitator, and enabler. Our findings highlight the relational harms of AI chatbots and the danger of algorithmic compliance, enhancing the understanding of AI harms in socio-emotional interactions. We also provide suggestions for designing ethical and responsible AI systems that prioritize user safety and well-being.
Submitted 26 January, 2025; v1 submitted 26 October, 2024; originally announced October 2024.

arXiv:2410.18415 [pdf, other] (cs.CL) https://arxiv.org/abs/2410.18415
Title: Decoding on Graphs: Faithful and Sound Reasoning on Knowledge Graphs through Generation of Well-Formed Chains
Authors: Kun Li, Tianhua Zhang, Xixin Wu, Hongyin Luo, James Glass, Helen Meng
Abstract: Knowledge Graphs (KGs) can serve as reliable knowledge sources for question answering (QA) due to their structured representation of knowledge. Existing research on the utilization of KGs for large language models (LLMs) relies predominantly on subgraph retrievers or iterative prompting, overlooking the potential synergy between LLMs' step-wise reasoning capabilities and the structural nature of KGs. In this paper, we present DoG (Decoding on Graphs), a novel framework that facilitates a deep synergy between LLMs and KGs. We first define a concept, the well-formed chain, which consists of a sequence of interrelated fact triplets on the KG, starting from question entities and leading to answers. We argue that this concept can serve as a principle for making faithful and sound reasoning for KGQA.
To enable LLMs to generate well-formed chains, we propose graph-aware constrained decoding, in which a constraint derived from the topology of the KG regulates the decoding process of the LLMs. This constrained decoding method ensures the generation of well-formed chains while making full use of the step-wise reasoning capabilities of LLMs. Based on the above, DoG, a training-free approach, is able to provide faithful and sound reasoning trajectories grounded on the KGs. Experiments across various KGQA tasks with different background KGs demonstrate that DoG achieves superior and robust performance. DoG also shows general applicability with various open-source LLMs. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.18415v1-abstract-full').style.display = 'none'; document.getElementById('2410.18415v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.07542">arXiv:2410.07542</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.07542">pdf</a>, <a href="https://arxiv.org/format/2410.07542">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Generalizable Indoor Human Activity Recognition Method Based on Micro-Doppler Corner Point Cloud and Dynamic Graph Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Yang%2C+X">Xiaopeng Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Gao%2C+W">Weicheng Gao</a>, <a href="/search/cs?searchtype=author&amp;query=Qu%2C+X">Xiaodong Qu</a>, <a href="/search/cs?searchtype=author&amp;query=Meng%2C+H">Haoyu Meng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.07542v1-abstract-short" style="display: inline;"> Through-the-wall radar (TWR) human activity recognition can be achieved by fusing micro-Doppler signature extraction and intelligent decision-making algorithms. However, limited by insufficient prior knowledge of testers in practical indoor scenarios, models trained on one tester commonly struggle to perform inference well on other testers, which causes poor generalization ability. To solve this proble&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.07542v1-abstract-full').style.display = 'inline'; document.getElementById('2410.07542v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.07542v1-abstract-full" style="display: none;"> Through-the-wall radar (TWR) human activity recognition can be achieved by fusing micro-Doppler signature extraction and intelligent decision-making algorithms. 
However, limited by insufficient prior knowledge of testers in practical indoor scenarios, models trained on one tester commonly struggle to perform inference well on other testers, which causes poor generalization ability. To solve this problem, this paper proposes a generalizable indoor human activity recognition method based on micro-Doppler corner point cloud and dynamic graph learning. In the proposed method, DoG-μD-CornerDet is used for micro-Doppler corner extraction on two types of radar profiles. Then, a micro-Doppler corner filtering method based on polynomial fitting smoothing is proposed to maximize the feature distance under the constraints of the kinematic model. The extracted corners from the two types of radar profiles are concatenated into a three-dimensional point cloud. Finally, the paper proposes a dynamic graph neural network (DGNN)-based recognition method for data-to-activity label mapping. Visualization, comparison and ablation experiments are carried out to verify the effectiveness of the proposed method. The results prove that the proposed method has strong generalization ability on radar data collected from different testers. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.07542v1-abstract-full').style.display = 'none'; document.getElementById('2410.07542v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">15 pages, 12 figures, 6 tables, in IEEE Transactions on Aerospace and Electronic Systems, 2024</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">MSC Class:</span> 94 <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> I.5.1 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.16322">arXiv:2409.16322</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2409.16322">pdf</a>, <a href="https://arxiv.org/format/2409.16322">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Neurons and Cognition">q-bio.NC</span> </div> </div> <p class="title is-5 mathjax"> Towards Within-Class Variation in Alzheimer&#39;s Disease Detection from Spontaneous Speech </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Kang%2C+J">Jiawen Kang</a>, <a href="/search/cs?searchtype=author&amp;query=Han%2C+D">Dongrui Han</a>, <a href="/search/cs?searchtype=author&amp;query=Meng%2C+L">Lingwei Meng</a>, <a 
href="/search/cs?searchtype=author&amp;query=Zhou%2C+J">Jingyan Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+J">Jinchao Li</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+X">Xixin Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Meng%2C+H">Helen Meng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.16322v1-abstract-short" style="display: inline;"> Alzheimer&#39;s Disease (AD) detection has emerged as a promising research area that employs machine learning classification models to distinguish between individuals with AD and those without. Unlike conventional classification tasks, we identify within-class variation as a critical challenge in AD detection: individuals with AD exhibit a spectrum of cognitive impairments. Given that many AD detectio&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.16322v1-abstract-full').style.display = 'inline'; document.getElementById('2409.16322v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.16322v1-abstract-full" style="display: none;"> Alzheimer&#39;s Disease (AD) detection has emerged as a promising research area that employs machine learning classification models to distinguish between individuals with AD and those without. Unlike conventional classification tasks, we identify within-class variation as a critical challenge in AD detection: individuals with AD exhibit a spectrum of cognitive impairments. Given that many AD detection tasks lack fine-grained labels, simplistic binary classification may overlook two crucial aspects: within-class differences and instance-level imbalance. The former compels the model to map AD samples with varying degrees of impairment to a single diagnostic label, disregarding certain changes in cognitive function. While the latter biases the model towards overrepresented severity levels. This work presents early efforts to address these challenges. We propose two novel methods: Soft Target Distillation (SoTD) and Instance-level Re-balancing (InRe), targeting two problems respectively. Experiments on the ADReSS and ADReSSo datasets demonstrate that the proposed methods significantly improve detection accuracy. Further analysis reveals that SoTD effectively harnesses the strengths of multiple component models, while InRe substantially alleviates model over-fitting. These findings provide insights for developing more robust and reliable AD detection models. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.16322v1-abstract-full').style.display = 'none'; document.getElementById('2409.16322v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.12560">arXiv:2409.12560</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2409.12560">pdf</a>, <a href="https://arxiv.org/format/2409.12560">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> AudioComposer: Towards Fine-grained Audio Generation with Natural Language Descriptions </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Y">Yuanyuan Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+H">Hangting Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+D">Dongchao Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+Z">Zhiyong Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Meng%2C+H">Helen Meng</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+X">Xixin Wu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.12560v1-abstract-short" style="display: inline;"> Current Text-to-audio (TTA) models mainly use coarse text descriptions as inputs to generate audio, which hinders models from generating audio with fine-grained control of content and style. Some studies try to improve the granularity by incorporating additional frame-level conditions or control networks. However, this usually leads to complex system design and difficulties due to the requirement&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.12560v1-abstract-full').style.display = 'inline'; document.getElementById('2409.12560v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.12560v1-abstract-full" style="display: none;"> Current Text-to-audio (TTA) models mainly use coarse text descriptions as inputs to generate audio, which hinders models from generating audio with fine-grained control of content and style. Some studies try to improve the granularity by incorporating additional frame-level conditions or control networks. However, this usually leads to complex system design and difficulties due to the requirement for reference frame-level conditions. To address these challenges, we propose AudioComposer, a novel TTA generation framework that relies solely on natural language descriptions (NLDs) to provide both content specification and style control information. To further enhance audio generative modeling, we employ flow-based diffusion transformers with the cross-attention mechanism to incorporate text descriptions effectively into audio generation processes, which can not only simultaneously consider the content and style information in the text inputs, but also accelerate generation compared to other architectures. Furthermore, we propose a novel and comprehensive automatic data simulation pipeline to construct data with fine-grained text descriptions, which significantly alleviates the problem of data scarcity in the area. Experiments demonstrate the effectiveness of our framework using solely NLDs as inputs for content specification and style control. 
The generation quality and controllability surpass those of state-of-the-art TTA models, even with a smaller model size. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.12560v1-abstract-full').style.display = 'none'; document.getElementById('2409.12560v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.12388">arXiv:2409.12388</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2409.12388">pdf</a>, <a href="https://arxiv.org/format/2409.12388">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> Disentangling Speakers in Multi-Talker Speech Recognition with Speaker-Aware CTC </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Kang%2C+J">Jiawen Kang</a>, <a href="/search/cs?searchtype=author&amp;query=Meng%2C+L">Lingwei Meng</a>, <a href="/search/cs?searchtype=author&amp;query=Cui%2C+M">Mingyu Cui</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Y">Yuejiao Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+X">Xixin Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+X">Xunying Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Meng%2C+H">Helen Meng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.12388v2-abstract-short" style="display: inline;"> Multi-talker speech recognition (MTASR) faces unique challenges in disentangling and transcribing overlapping speech. To address these challenges, this paper investigates the role of Connectionist Temporal Classification (CTC) in speaker disentanglement when incorporated with Serialized Output Training (SOT) for MTASR. Our visualization reveals that CTC guides the encoder to represent different sp&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.12388v2-abstract-full').style.display = 'inline'; document.getElementById('2409.12388v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.12388v2-abstract-full" style="display: none;"> Multi-talker speech recognition (MTASR) faces unique challenges in disentangling and transcribing overlapping speech. To address these challenges, this paper investigates the role of Connectionist Temporal Classification (CTC) in speaker disentanglement when incorporated with Serialized Output Training (SOT) for MTASR. Our visualization reveals that CTC guides the encoder to represent different speakers in distinct temporal regions of acoustic embeddings. Leveraging this insight, we propose a novel Speaker-Aware CTC (SACTC) training objective, based on the Bayes risk CTC framework. 
SACTC is a tailored CTC variant for multi-talker scenarios; it explicitly models speaker disentanglement by constraining the encoder to represent different speakers&#39; tokens at specific time frames. When integrated with SOT, the SOT-SACTC model consistently outperforms standard SOT-CTC across various degrees of speech overlap. Specifically, we observe relative word error rate reductions of 10% overall and 15% on low-overlap speech. This work represents an initial exploration of CTC-based enhancements for MTASR tasks, offering a new perspective on speaker disentanglement in multi-talker speech recognition. The code is available at https://github.com/kjw11/Speaker-Aware-CTC. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.12388v2-abstract-full').style.display = 'none'; document.getElementById('2409.12388v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 18 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by ICASSP2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.11630">arXiv:2409.11630</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2409.11630">pdf</a>, <a href="https://arxiv.org/format/2409.11630">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Speaking from Coarse to Fine: Improving Neural Codec Language Model via Multi-Scale Speech Coding and Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Guo%2C+H">Haohan Guo</a>, <a href="/search/cs?searchtype=author&amp;query=Xie%2C+F">Fenglong Xie</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+D">Dongchao Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+X">Xixin Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Meng%2C+H">Helen Meng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.11630v1-abstract-short" style="display: inline;"> The neural codec language model (CLM) has demonstrated remarkable performance in text-to-speech (TTS) synthesis. However, troubled by &#34;recency bias&#34;, CLM lacks sufficient attention to coarse-grained information at a higher temporal scale, often producing unnatural or even unintelligible speech. 
This work proposes CoFi-Speech, a coarse-to-fine CLM-TTS approach, employing multi-scale speech coding&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.11630v1-abstract-full').style.display = 'inline'; document.getElementById('2409.11630v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.11630v1-abstract-full" style="display: none;"> The neural codec language model (CLM) has demonstrated remarkable performance in text-to-speech (TTS) synthesis. However, troubled by &#34;recency bias&#34;, CLM lacks sufficient attention to coarse-grained information at a higher temporal scale, often producing unnatural or even unintelligible speech. This work proposes CoFi-Speech, a coarse-to-fine CLM-TTS approach, employing multi-scale speech coding and generation to address this issue. We train a multi-scale neural codec, CoFi-Codec, to encode speech into a multi-scale discrete representation, comprising multiple token sequences with different time resolutions. Then, we propose CoFi-LM that can generate this representation in two modes: the single-LM-based chain-of-scale generation and the multiple-LM-based stack-of-scale generation. In experiments, CoFi-Speech significantly outperforms single-scale baseline systems on naturalness and speaker similarity in zero-shot TTS. The analysis of multi-scale coding demonstrates the effectiveness of CoFi-Codec in learning multi-scale discrete speech representations while keeping high-quality speech reconstruction. The coarse-to-fine multi-scale generation, especially for the stack-of-scale approach, is also validated as a crucial approach in pursuing a high-quality neural codec language model for TTS. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.11630v1-abstract-full').style.display = 'none'; document.getElementById('2409.11630v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. 
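</p> <p class="is-size-7 has-text-grey-dark">To make the chain-of-scale mode described above concrete, here is a minimal Python sketch under stated assumptions: a single LM first emits the coarsest token sequence, then each finer scale is generated conditioned on the text plus all coarser scales. The function <code>lm_generate</code> and the scale setup are hypothetical placeholders, not the CoFi-LM API.</p> <pre><code>
# Illustrative sketch of single-LM "chain-of-scale" generation.
def chain_of_scale(lm_generate, text_tokens, num_scales=3):
    context = list(text_tokens)
    token_scales = []                      # coarse-to-fine sequences
    for scale in range(num_scales):
        tokens = lm_generate(context, scale=scale)  # hypothetical call
        token_scales.append(tokens)
        context = context + list(tokens)   # finer scales see coarser output
    return token_scales[-1]                # finest tokens go to the codec decoder
</code></pre> <p class="is-size-7">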
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.10411">arXiv:2409.10411</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2409.10411">pdf</a>, <a href="https://arxiv.org/format/2409.10411">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Software Engineering">cs.SE</span> </div> </div> <p class="title is-5 mathjax"> A Large-Scale Privacy Assessment of Android Third-Party SDKs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Meng%2C+M+H">Mark Huasong Meng</a>, <a href="/search/cs?searchtype=author&amp;query=Yan%2C+C">Chuan Yan</a>, <a href="/search/cs?searchtype=author&amp;query=Hao%2C+Y">Yun Hao</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Q">Qing Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Z">Zeyu Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+K">Kailong Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Teo%2C+S+G">Sin Gee Teo</a>, <a href="/search/cs?searchtype=author&amp;query=Bai%2C+G">Guangdong Bai</a>, <a href="/search/cs?searchtype=author&amp;query=Dong%2C+J+S">Jin Song Dong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.10411v1-abstract-short" style="display: inline;"> Third-party Software Development Kits (SDKs) are widely adopted in Android app development, to effortlessly accelerate development pipelines and enhance app functionality. However, this convenience raises substantial concerns about unauthorized access to users&#39; privacy-sensitive information, which could be further abused for illegitimate purposes like user tracking or monetization. Our study offer&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.10411v1-abstract-full').style.display = 'inline'; document.getElementById('2409.10411v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.10411v1-abstract-full" style="display: none;"> Third-party Software Development Kits (SDKs) are widely adopted in Android app development, to effortlessly accelerate development pipelines and enhance app functionality. However, this convenience raises substantial concerns about unauthorized access to users&#39; privacy-sensitive information, which could be further abused for illegitimate purposes like user tracking or monetization. Our study offers a targeted analysis of user privacy protection among Android third-party SDKs, filling a critical gap in the Android software supply chain. It focuses on two aspects of their privacy practices, including data exfiltration and behavior-policy compliance (or privacy compliance), utilizing techniques of taint analysis and large language models. It covers 158 widely-used SDKs from two key SDK release platforms, the official one and a large alternative one. From them, we identified 338 instances of privacy data exfiltration. On the privacy compliance, our study reveals that more than 30% of the examined SDKs fail to provide a privacy policy to disclose their data handling practices. 
Among those that provide privacy policies, 37% of them over-collect user data, and 88% falsely claim access to sensitive data. We revisit the latest versions of the SDKs after 12 months. Our analysis demonstrates a persistent lack of improvement in these concerning trends. Based on our findings, we propose three actionable recommendations to mitigate the privacy leakage risks and enhance privacy protection for Android users. Our research not only serves as an urgent call for industry attention but also provides crucial insights for future regulatory interventions. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.10411v1-abstract-full').style.display = 'none'; document.getElementById('2409.10411v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">16 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.08596">arXiv:2409.08596</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2409.08596">pdf</a>, <a href="https://arxiv.org/format/2409.08596">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Large Language Model Can Transcribe Speech in Multi-Talker Scenarios with Versatile Instructions </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Meng%2C+L">Lingwei Meng</a>, <a href="/search/cs?searchtype=author&amp;query=Hu%2C+S">Shujie Hu</a>, <a href="/search/cs?searchtype=author&amp;query=Kang%2C+J">Jiawen Kang</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Z">Zhaoqing Li</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Y">Yuejiao Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+W">Wenxuan Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+X">Xixin Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+X">Xunying Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Meng%2C+H">Helen Meng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.08596v1-abstract-short" style="display: inline;"> Recent advancements in large language models (LLMs) have revolutionized various domains, bringing significant progress and new opportunities. Despite progress in speech-related tasks, LLMs have not been sufficiently explored in multi-talker scenarios. 
In this work, we present a pioneering effort to investigate the capability of LLMs in transcribing speech in multi-talker environments, following ve&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.08596v1-abstract-full').style.display = 'inline'; document.getElementById('2409.08596v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.08596v1-abstract-full" style="display: none;"> Recent advancements in large language models (LLMs) have revolutionized various domains, bringing significant progress and new opportunities. Despite progress in speech-related tasks, LLMs have not been sufficiently explored in multi-talker scenarios. In this work, we present a pioneering effort to investigate the capability of LLMs in transcribing speech in multi-talker environments, following versatile instructions related to multi-talker automatic speech recognition (ASR), target talker ASR, and ASR based on specific talker attributes such as sex, occurrence order, language, and keyword spoken. Our approach utilizes WavLM and Whisper encoder to extract multi-faceted speech representations that are sensitive to speaker characteristics and semantic context. These representations are then fed into an LLM fine-tuned using LoRA, enabling the capabilities for speech comprehension and transcription. Comprehensive experiments reveal the promising performance of our proposed system, MT-LLM, in cocktail party scenarios, highlighting the potential of LLM to handle speech-related tasks based on user instructions in such complex settings. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.08596v1-abstract-full').style.display = 'none'; document.getElementById('2409.08596v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.06029">arXiv:2409.06029</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2409.06029">pdf</a>, <a href="https://arxiv.org/format/2409.06029">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> SongCreator: Lyrics-based Universal Song Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Lei%2C+S">Shun Lei</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+Y">Yixuan Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Tang%2C+B">Boshi Tang</a>, <a href="/search/cs?searchtype=author&amp;query=Lam%2C+M+W+Y">Max W. Y. 
Lam</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+F">Feng Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+H">Hangyu Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+J">Jingcheng Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Kang%2C+S">Shiyin Kang</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+Z">Zhiyong Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Meng%2C+H">Helen Meng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.06029v2-abstract-short" style="display: inline;"> Music is an integral part of human culture, embodying human intelligence and creativity, of which songs compose an essential part. While various aspects of song generation have been explored by previous works, such as singing voice, vocal composition and instrumental arrangement, etc., generating songs with both vocals and accompaniment given lyrics remains a significant challenge, hindering the a&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.06029v2-abstract-full').style.display = 'inline'; document.getElementById('2409.06029v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.06029v2-abstract-full" style="display: none;"> Music is an integral part of human culture, embodying human intelligence and creativity, of which songs compose an essential part. While various aspects of song generation have been explored by previous works, such as singing voice, vocal composition and instrumental arrangement, etc., generating songs with both vocals and accompaniment given lyrics remains a significant challenge, hindering the application of music generation models in the real world. In this light, we propose SongCreator, a song-generation system designed to tackle this challenge. The model features two novel designs: a meticulously designed dual-sequence language model (DSLM) to capture the information of vocals and accompaniment for song generation, and a series of attention mask strategies for DSLM, which allows our model to understand, generate and edit songs, making it suitable for various song-related generation tasks by utilizing specific attention masks. Extensive experiments demonstrate the effectiveness of SongCreator by achieving state-of-the-art or competitive performances on all eight tasks. Notably, it surpasses previous works by a large margin in lyrics-to-song and lyrics-to-vocals. Additionally, it is able to independently control the acoustic conditions of the vocals and accompaniment in the generated song through different audio prompts, exhibiting its potential applicability. Our samples are available at https://thuhcsi.github.io/SongCreator/. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.06029v2-abstract-full').style.display = 'none'; document.getElementById('2409.06029v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 9 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. 
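</p> <p class="is-size-7 has-text-grey-dark">The abstract above attributes task flexibility to attention mask strategies over two sequences. The sketch below shows one generic way such a mask could be laid out, assuming a conditioning track that attends bidirectionally and a generated track that is causal; the layout is a guess for illustration, not the paper's DSLM masks.</p> <pre><code>
# Illustrative sketch of a task-specific attention mask for a
# dual-sequence setup (hypothetical layout, not the authors' design).
import numpy as np

def dual_track_mask(cond_len, tgt_len):
    n = cond_len + tgt_len
    mask = np.zeros((n, n), dtype=bool)      # True = attention allowed
    mask[:cond_len, :cond_len] = True        # condition: bidirectional
    causal = np.tril(np.ones((tgt_len, tgt_len), dtype=bool))
    mask[cond_len:, cond_len:] = causal      # target: causal on itself
    mask[cond_len:, :cond_len] = True        # target reads the condition
    return mask

# e.g. vocals (3 frames) conditioning accompaniment (4 frames):
print(dual_track_mask(3, 4).astype(int))
</code></pre> <p class="is-size-7">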
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by NeurIPS 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.00933">arXiv:2409.00933</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2409.00933">pdf</a>, <a href="https://arxiv.org/format/2409.00933">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> SoCodec: A Semantic-Ordered Multi-Stream Speech Codec for Efficient Language Model Based Text-to-Speech Synthesis </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Guo%2C+H">Haohan Guo</a>, <a href="/search/cs?searchtype=author&amp;query=Xie%2C+F">Fenglong Xie</a>, <a href="/search/cs?searchtype=author&amp;query=Xie%2C+K">Kun Xie</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+D">Dongchao Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Guo%2C+D">Dake Guo</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+X">Xixin Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Meng%2C+H">Helen Meng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.00933v1-abstract-short" style="display: inline;"> The long speech sequence has been troubling language models (LM) based TTS approaches in terms of modeling complexity and efficiency. This work proposes SoCodec, a semantic-ordered multi-stream speech codec, to address this issue. It compresses speech into a shorter, multi-stream discrete semantic sequence with multiple tokens at each frame. Meanwhile, the ordered product quantization is proposed&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.00933v1-abstract-full').style.display = 'inline'; document.getElementById('2409.00933v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.00933v1-abstract-full" style="display: none;"> The long speech sequence has been troubling language models (LM) based TTS approaches in terms of modeling complexity and efficiency. This work proposes SoCodec, a semantic-ordered multi-stream speech codec, to address this issue. It compresses speech into a shorter, multi-stream discrete semantic sequence with multiple tokens at each frame. Meanwhile, the ordered product quantization is proposed to constrain this sequence into an ordered representation. It can be applied with a multi-stream delayed LM to achieve better autoregressive generation along both time and stream axes in TTS. The experimental result strongly demonstrates the effectiveness of the proposed approach, achieving superior performance over baseline systems even if compressing the frameshift of speech from 20ms to 240ms (12x). The ablation studies further validate the importance of learning the proposed ordered multi-stream semantic representation in pursuing shorter speech sequences for efficient LM-based TTS. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.00933v1-abstract-full').style.display = 'none'; document.getElementById('2409.00933v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by SLT 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.14357">arXiv:2408.14357</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2408.14357">pdf</a>, <a href="https://arxiv.org/format/2408.14357">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Software Engineering">cs.SE</span> </div> </div> <p class="title is-5 mathjax"> Exploring ChatGPT App Ecosystem: Distribution, Deployment and Security </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Yan%2C+C">Chuan Yan</a>, <a href="/search/cs?searchtype=author&amp;query=Ren%2C+R">Ruomai Ren</a>, <a href="/search/cs?searchtype=author&amp;query=Meng%2C+M+H">Mark Huasong Meng</a>, <a href="/search/cs?searchtype=author&amp;query=Wan%2C+L">Liuhuo Wan</a>, <a href="/search/cs?searchtype=author&amp;query=Ooi%2C+T+Y">Tian Yang Ooi</a>, <a href="/search/cs?searchtype=author&amp;query=Bai%2C+G">Guangdong Bai</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.14357v1-abstract-short" style="display: inline;"> ChatGPT has enabled third-party developers to create plugins to expand ChatGPT&#39;s capabilities.These plugins are distributed through OpenAI&#39;s plugin store, making them easily accessible to users. With ChatGPT as the backbone, this app ecosystem has illustrated great business potential by offering users personalized services in a conversational manner. Nonetheless, many crucial aspects regarding app&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.14357v1-abstract-full').style.display = 'inline'; document.getElementById('2408.14357v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.14357v1-abstract-full" style="display: none;"> ChatGPT has enabled third-party developers to create plugins to expand ChatGPT&#39;s capabilities.These plugins are distributed through OpenAI&#39;s plugin store, making them easily accessible to users. With ChatGPT as the backbone, this app ecosystem has illustrated great business potential by offering users personalized services in a conversational manner. Nonetheless, many crucial aspects regarding app development, deployment, and security of this ecosystem have yet to be thoroughly studied in the research community, potentially hindering a broader adoption by both developers and users. In this work, we conduct the first comprehensive study of the ChatGPT app ecosystem, aiming to illuminate its landscape for our research community. 
Our study examines the distribution and deployment models in the integration of LLMs and third-party apps, and assesses their security and privacy implications. We uncover an uneven distribution of functionality among ChatGPT plugins, highlighting prevalent and emerging topics. We also identify severe flaws in the authentication and user data protection for third-party app APIs integrated within LLMs, revealing a concerning status quo of security and privacy in this app ecosystem. Our work provides insights for the secure and sustainable development of this rapidly evolving ecosystem. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.14357v1-abstract-full').style.display = 'none'; document.getElementById('2408.14357v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by the 39th IEEE/ACM International Conference on Automated Software Engineering (ASE 2024)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.13893">arXiv:2408.13893</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2408.13893">pdf</a>, <a href="https://arxiv.org/format/2408.13893">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> SimpleSpeech 2: Towards Simple and Efficient Text-to-Speech with Flow-based Scalar Latent Transformer Diffusion Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Yang%2C+D">Dongchao Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+R">Rongjie Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Y">Yuanyuan Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Guo%2C+H">Haohan Guo</a>, <a href="/search/cs?searchtype=author&amp;query=Chong%2C+D">Dading Chong</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+S">Songxiang Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+X">Xixin Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Meng%2C+H">Helen Meng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.13893v2-abstract-short" style="display: inline;"> Scaling Text-to-speech (TTS) to large-scale datasets has been demonstrated as an effective method for improving the diversity and naturalness of synthesized speech. At the high level, previous large-scale TTS models can be categorized into either Auto-regressive (AR) based (e.g., VALL-E) or Non-auto-regressive (NAR) based models (e.g., NaturalSpeech 2/3). 
Although these works dem&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.13893v2-abstract-full').style.display = 'inline'; document.getElementById('2408.13893v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.13893v2-abstract-full" style="display: none;"> Scaling Text-to-speech (TTS) to large-scale datasets has been demonstrated as an effective method for improving the diversity and naturalness of synthesized speech. At the high level, previous large-scale TTS models can be categorized into either Auto-regressive (AR) based (e.g., VALL-E) or Non-auto-regressive (NAR) based models (e.g., NaturalSpeech 2/3). Although these works demonstrate good performance, they still have potential weaknesses. For instance, AR-based models are plagued by unstable generation quality and slow generation speed; meanwhile, some NAR-based models need phoneme-level duration alignment information, thereby increasing the complexity of data pre-processing, model design, and loss design. In this work, we build upon our previous publication by implementing a simple and efficient non-autoregressive (NAR) TTS framework, termed SimpleSpeech 2. SimpleSpeech 2 effectively combines the strengths of both autoregressive (AR) and non-autoregressive (NAR) methods, offering the following key advantages: (1) simplified data preparation; (2) straightforward model and loss design; and (3) stable, high-quality generation performance with fast inference speed. Compared to our previous publication, we present (i) a detailed analysis of the influence of speech tokenizer and noisy label on TTS performance; (ii) four distinct types of sentence duration predictors; (iii) a novel flow-based scalar latent transformer diffusion model. With these improvements, we show a significant improvement in generation performance and generation speed compared to our previous work and other state-of-the-art (SOTA) large-scale TTS models. Furthermore, we show that SimpleSpeech 2 can be seamlessly extended to multilingual TTS by training it on multilingual speech datasets. Demos are available on: https://dongchaoyang.top/SimpleSpeech2_demo/. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.13893v2-abstract-full').style.display = 'none'; document.getElementById('2408.13893v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 25 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. 
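</p> <p class="is-size-7 has-text-grey-dark">The abstract above contrasts NAR generation with AR decoding and mentions sentence-level duration predictors. The following is only a minimal sketch of the general NAR pipeline this implies: a duration predictor fixes the latent length before iterative refinement. Every callable is a hypothetical placeholder, not SimpleSpeech 2 code.</p> <pre><code>
# Illustrative sketch of a duration-conditioned NAR TTS flow.
def nar_tts(text, predict_seconds, sample_latent, decode, frame_rate=50):
    seconds = predict_seconds(text)             # one duration per sentence
    num_frames = max(1, round(seconds * frame_rate))
    latent = sample_latent(text, num_frames)    # flow/diffusion refinement
    return decode(latent)                       # waveform out
</code></pre> <p class="is-size-7">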
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Submit to TASLP</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.10588">arXiv:2408.10588</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2408.10588">pdf</a>, <a href="https://arxiv.org/format/2408.10588">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> </div> </div> <p class="title is-5 mathjax"> DEGAS: Detailed Expressions on Full-Body Gaussian Avatars </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Shao%2C+Z">Zhijing Shao</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+D">Duotun Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Tian%2C+Q">Qing-Yao Tian</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+Y">Yao-Dong Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Meng%2C+H">Hengyu Meng</a>, <a href="/search/cs?searchtype=author&amp;query=Cai%2C+Z">Zeyu Cai</a>, <a href="/search/cs?searchtype=author&amp;query=Dong%2C+B">Bo Dong</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Y">Yu Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+K">Kang Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Z">Zeyu Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.10588v2-abstract-short" style="display: inline;"> Although neural rendering has made significant advances in creating lifelike, animatable full-body and head avatars, incorporating detailed expressions into full-body avatars remains largely unexplored. We present DEGAS, the first 3D Gaussian Splatting (3DGS)-based modeling method for full-body avatars with rich facial expressions. Trained on multiview videos of a given subject, our method learns&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.10588v2-abstract-full').style.display = 'inline'; document.getElementById('2408.10588v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.10588v2-abstract-full" style="display: none;"> Although neural rendering has made significant advances in creating lifelike, animatable full-body and head avatars, incorporating detailed expressions into full-body avatars remains largely unexplored. We present DEGAS, the first 3D Gaussian Splatting (3DGS)-based modeling method for full-body avatars with rich facial expressions. Trained on multiview videos of a given subject, our method learns a conditional variational autoencoder that takes both the body motion and facial expression as driving signals to generate Gaussian maps in the UV layout. To drive the facial expressions, instead of the commonly used 3D Morphable Models (3DMMs) in 3D head avatars, we propose to adopt the expression latent space trained solely on 2D portrait images, bridging the gap between 2D talking faces and 3D avatars. 
Leveraging the rendering capability of 3DGS and the rich expressiveness of the expression latent space, the learned avatars can be reenacted to reproduce photorealistic rendering images with subtle and accurate facial expressions. Experiments on an existing dataset and our newly proposed dataset of full-body talking avatars demonstrate the efficacy of our method. We also propose an audio-driven extension of our method with the help of 2D talking faces, opening new possibilities for interactive AI agents. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.10588v2-abstract-full').style.display = 'none'; document.getElementById('2408.10588v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 20 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">3DV 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.10493">arXiv:2408.10493</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2408.10493">pdf</a>, <a href="https://arxiv.org/format/2408.10493">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Clustering by Mining Density Distributions and Splitting Manifold Structure </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Xu%2C+Z">Zhichang Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Long%2C+Z">Zhiguo Long</a>, <a href="/search/cs?searchtype=author&amp;query=Meng%2C+H">Hua Meng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.10493v2-abstract-short" style="display: inline;"> Spectral clustering requires the time-consuming decomposition of the Laplacian matrix of the similarity graph, thus limiting its applicability to large datasets. To improve the efficiency of spectral clustering, a top-down approach was recently proposed, which first divides the data into several micro-clusters (granular-balls), then splits these micro-clusters when they are not &#34;compact&#34;, and fi&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.10493v2-abstract-full').style.display = 'inline'; document.getElementById('2408.10493v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.10493v2-abstract-full" style="display: none;"> Spectral clustering requires the time-consuming decomposition of the Laplacian matrix of the similarity graph, thus limiting its applicability to large datasets. 
To improve the efficiency of spectral clustering, a top-down approach was recently proposed, which first divides the data into several micro-clusters (granular-balls), then splits these micro-clusters when they are not &#34;compact&#34;, and finally uses these micro-clusters as nodes to construct a similarity graph for more efficient spectral clustering. However, this top-down approach is challenging to adapt to unevenly distributed or structurally complex data. This is because constructing micro-clusters as a rough ball struggles to capture the shape and structure of data in a local range, and the simplistic splitting rule that solely targets &#34;compactness&#34; is susceptible to noise and variations in data density and leads to micro-clusters with varying shapes, making it challenging to accurately measure the similarity between them. To resolve these issues and improve spectral clustering, this paper first proposes to start from local structures to obtain micro-clusters, such that the complex structural information inside local neighborhoods is well captured by them. Moreover, by noting that Euclidean distance is more suitable for convex sets, this paper further proposes a data splitting rule that couples local density and data manifold structures, so that the similarities of the obtained micro-clusters can be easily characterized. A novel similarity measure between micro-clusters is then proposed for the final spectral clustering. A series of experiments based on synthetic and real-world datasets demonstrate that the proposed method has better adaptability to structurally complex data than granular-ball based methods. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.10493v2-abstract-full').style.display = 'none'; document.getElementById('2408.10493v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 19 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.10084">arXiv:2408.10084</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2408.10084">pdf</a>, <a href="https://arxiv.org/format/2408.10084">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> TANGO: Clustering with Typicality-Aware Nonlocal Mode-Seeking and Graph-Cut Optimization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Ma%2C+H">Haowen Ma</a>, <a href="/search/cs?searchtype=author&amp;query=Long%2C+Z">Zhiguo Long</a>, <a href="/search/cs?searchtype=author&amp;query=Meng%2C+H">Hua Meng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.10084v1-abstract-short" style="display: inline;"> Density-based clustering methods by mode-seeking usually achieve clustering by using local density estimation to mine structural information, such as local dependencies from lower density points to higher neighbors. 

arXiv:2408.10084 (https://arxiv.org/abs/2408.10084) [pdf, other] cs.LG
TANGO: Clustering with Typicality-Aware Nonlocal Mode-Seeking and Graph-Cut Optimization
Authors: Haowen Ma, Zhiguo Long, Hua Meng
Abstract: Density-based clustering methods based on mode-seeking usually cluster by using local density estimation to mine structural information, such as local dependencies from lower-density points to higher-density neighbors. However, they often rely too heavily on local structures and neglect global characteristics, which can lead to significant errors in peak selection and dependency establishment. Although introducing more hyperparameters to revise the dependencies can mitigate this issue, tuning them is challenging, and even impossible, on real-world datasets. In this paper, we propose a new algorithm (TANGO) that establishes local dependencies by exploiting a global-view typicality of points, obtained by further mining the density distributions and the initial dependencies. TANGO then obtains sub-clusters with the help of the adjusted dependencies, and characterizes the similarity between sub-clusters by incorporating path-based connectivity. It achieves the final clustering by applying graph-cut to the sub-clusters, thus avoiding the challenging selection of cluster centers. The paper also provides a theoretical analysis and an efficient method for computing typicality. Experimental results on several synthetic and 16 real-world datasets demonstrate the effectiveness and superiority of TANGO.
Submitted 19 August, 2024; originally announced August 2024.
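
For context, the "initial dependencies" that TANGO adjusts follow the classic mode-seeking recipe: each point attaches to its nearest neighbor of strictly higher density. A minimal sketch, with a Gaussian density estimate and bandwidth chosen only for illustration (the typicality adjustment itself is not reproduced):

```python
# Classic density-peaks style dependency assignment: estimate a local
# density per point, then point each sample at its nearest higher-density
# neighbor. Points with no denser neighbor are density peaks.
import numpy as np
from scipy.spatial.distance import cdist

def initial_dependencies(X, bandwidth=0.3):
    D = cdist(X, X)
    density = np.exp(-(D / bandwidth) ** 2).sum(axis=1)
    parent = np.full(len(X), -1)
    for i in range(len(X)):
        higher = np.where(density > density[i])[0]
        if len(higher):
            parent[i] = higher[np.argmin(D[i, higher])]
    return density, parent

X = np.random.rand(200, 2)
density, parent = initial_dependencies(X)
print((parent == -1).sum(), "density peak(s)")
```

The abstract's complaint is that this purely local rule misassigns peaks and dependencies; TANGO's global-view typicality is mined from exactly these densities and initial dependencies.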

arXiv:2408.04914 (https://arxiv.org/abs/2408.04914) [pdf, other] cs.CV
GuidedNet: Semi-Supervised Multi-Organ Segmentation via Labeled Data Guide Unlabeled Data
Authors: Haochen Zhao, Hui Meng, Deqian Yang, Xiaozheng Xie, Xiaoze Wu, Qingfeng Li, Jianwei Niu
Abstract: Semi-supervised multi-organ medical image segmentation aids physicians in improving disease diagnosis and treatment planning, and reduces the time and effort required for organ annotation. Existing state-of-the-art methods train the labeled data with ground truths and the unlabeled data with pseudo-labels. However, the two training flows are separate, which does not reflect the interrelationship between labeled and unlabeled data. To address this issue, we propose a semi-supervised multi-organ segmentation method called GuidedNet, which leverages knowledge from the labeled data to guide the training of the unlabeled data. The primary goals of this study are to improve the quality of pseudo-labels for unlabeled data and to enhance the network's learning capability for both small and complex organs. A key insight is that voxel features from labeled and unlabeled data that are close to each other in the feature space are likely to belong to the same class. On this basis, a 3D Consistent Gaussian Mixture Model (3D-CGMM) is designed to leverage the feature distributions from labeled data to rectify the generated pseudo-labels. Furthermore, we introduce a Knowledge Transfer Cross Pseudo Supervision (KT-CPS) strategy, which leverages the prior knowledge obtained from the labeled data to guide the training of the unlabeled data, thereby improving segmentation accuracy for both small and complex organs. Extensive experiments on two public datasets, FLARE22 and AMOS, demonstrate that GuidedNet achieves state-of-the-art performance. The source code of our proposed model is available at https://github.com/kimjisoo12/GuidedNet.
Submitted 2 September, 2024; v1 submitted 9 August, 2024; originally announced August 2024.
Comments: Accepted by ACM MM2024, 10 pages, 5 figures
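
The 3D-CGMM idea, fitting class-conditional Gaussians on labeled features and re-scoring unlabeled ones, can be illustrated with an ordinary Gaussian mixture; a hedged sketch with synthetic features (not the GuidedNet code, and a plain GMM standing in for the 3D-CGMM):

```python
# Pseudo-label rectification: fit one Gaussian per class on labeled voxel
# features, then relabel each unlabeled feature by the class whose
# distribution explains it best.
import numpy as np
from sklearn.mixture import GaussianMixture

rng = np.random.default_rng(0)
feat_lab = rng.normal(size=(300, 16))   # labeled voxel features
y_lab = rng.integers(0, 3, size=300)    # ground-truth classes
feat_unl = rng.normal(size=(100, 16))   # unlabeled voxel features
pseudo = rng.integers(0, 3, size=100)   # network's raw pseudo-labels

gmms = [GaussianMixture(n_components=1).fit(feat_lab[y_lab == c])
        for c in range(3)]
loglik = np.stack([g.score_samples(feat_unl) for g in gmms], axis=1)
rectified = loglik.argmax(axis=1)       # class with highest log-likelihood
print("changed pseudo-labels:", int((rectified != pseudo).sum()))
```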

arXiv:2408.04905 (https://arxiv.org/abs/2408.04905) [pdf, other] cs.CL cs.AI
DOI: 10.1145/3691620.3695060
GlitchProber: Advancing Effective Detection and Mitigation of Glitch Tokens in Large Language Models
Authors: Zhibo Zhang, Wuxia Bai, Yuxi Li, Mark Huasong Meng, Kailong Wang, Ling Shi, Li Li, Jun Wang, Haoyu Wang
Abstract: Large language models (LLMs) have achieved unprecedented success in natural language processing. However, the black-box nature of their internal mechanisms has raised many concerns about their trustworthiness and interpretability. Recent research has discovered a class of abnormal tokens in the model's vocabulary space, named "glitch tokens"; once included in the input, these tokens may induce the model to produce incorrect, irrelevant, or even harmful results, drastically undermining the reliability and practicality of LLMs. In this work, we aim to deepen the understanding of glitch tokens and propose techniques for their detection and mitigation. We first reveal the characteristic features glitch tokens induce in LLMs, evidenced by significant deviations in the distributions of attention patterns and dynamic information from intermediate model layers. Based on these insights, we develop GlitchProber, a tool for efficient glitch-token detection and mitigation. GlitchProber uses small-scale sampling, principal component analysis for accelerated feature extraction, and a simple classifier for efficient vocabulary screening. Going one step further, it rectifies abnormal intermediate-layer values to mitigate the destructive effects of glitch tokens. Evaluated on five mainstream open-source LLMs, GlitchProber demonstrates higher efficiency, precision, and recall than existing approaches, with an average F1 score of 0.86 and an average repair rate of 50.06%. GlitchProber unveils a novel path to addressing the challenges posed by glitch tokens and inspires future research toward more robust and interpretable LLMs.
Submitted 22 September, 2024; v1 submitted 9 August, 2024; originally announced August 2024.
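
The screening recipe the abstract outlines (sample, compress, classify) is generic enough to sketch; the features below are synthetic placeholders for the attention and intermediate-layer statistics GlitchProber actually extracts:

```python
# Vocabulary screening: PCA-compress per-token features, train a small
# classifier on a labeled sample, then screen the whole vocabulary.
import numpy as np
from sklearn.decomposition import PCA
from sklearn.svm import SVC

rng = np.random.default_rng(1)
feats = rng.normal(size=(5000, 512))            # one row per vocab token
is_glitch = rng.random(5000) < 0.05             # hypothetical labels

z = PCA(n_components=32).fit_transform(feats)   # accelerated features
sample = rng.choice(5000, size=500, replace=False)
clf = SVC().fit(z[sample], is_glitch[sample])   # small-scale sampling
flagged = clf.predict(z)                        # screen the full vocabulary
print("tokens flagged for inspection:", int(flagged.sum()))
```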
href="/search/cs?searchtype=author&amp;query=Wang%2C+H">Haoyu Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.04905v2-abstract-short" style="display: inline;"> Large language models (LLMs) have achieved unprecedented success in the field of natural language processing. However, the black-box nature of their internal mechanisms has brought many concerns about their trustworthiness and interpretability. Recent research has discovered a class of abnormal tokens in the model&#39;s vocabulary space and named them &#34;glitch tokens&#34;. Those tokens, once included in th&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.04905v2-abstract-full').style.display = 'inline'; document.getElementById('2408.04905v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.04905v2-abstract-full" style="display: none;"> Large language models (LLMs) have achieved unprecedented success in the field of natural language processing. However, the black-box nature of their internal mechanisms has brought many concerns about their trustworthiness and interpretability. Recent research has discovered a class of abnormal tokens in the model&#39;s vocabulary space and named them &#34;glitch tokens&#34;. Those tokens, once included in the input, may induce the model to produce incorrect, irrelevant, or even harmful results, drastically undermining the reliability and practicality of LLMs. In this work, we aim to enhance the understanding of glitch tokens and propose techniques for their detection and mitigation. We first reveal the characteristic features induced by glitch tokens on LLMs, which are evidenced by significant deviations in the distributions of attention patterns and dynamic information from intermediate model layers. Based on the insights, we develop GlitchProber, a tool for efficient glitch token detection and mitigation. GlitchProber utilizes small-scale sampling, principal component analysis for accelerated feature extraction, and a simple classifier for efficient vocabulary screening. Taking one step further, GlitchProber rectifies abnormal model intermediate layer values to mitigate the destructive effects of glitch tokens. Evaluated on five mainstream open-source LLMs, GlitchProber demonstrates higher efficiency, precision, and recall compared to existing approaches, with an average F1 score of 0.86 and an average repair rate of 50.06%. GlitchProber unveils a novel path to address the challenges posed by glitch tokens and inspires future research toward more robust and interpretable LLMs. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.04905v2-abstract-full').style.display = 'none'; document.getElementById('2408.04905v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 9 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. 

arXiv:2407.14340 (https://arxiv.org/abs/2407.14340) [pdf, other] eess.IV cs.CV
DOI: 10.1109/CVPRW59228.2023.00135
Large Kernel Distillation Network for Efficient Single Image Super-Resolution
Authors: Chengxing Xie, Xiaoming Zhang, Linze Li, Haiteng Meng, Tianlin Zhang, Tianrui Li, Xiaole Zhao
Abstract: Efficient and lightweight single-image super-resolution (SISR) has achieved remarkable performance in recent years. One effective approach is the use of large-kernel designs, which improve the performance of SISR models while reducing their computational requirements. However, current state-of-the-art (SOTA) models still face problems such as high computational costs. To address these issues, we propose the Large Kernel Distillation Network (LKDN). Our approach simplifies the model structure and introduces more efficient attention modules to reduce computational costs while also improving performance. Specifically, we employ a reparameterization technique to enhance model performance without extra inference cost, and we introduce a new optimizer from other tasks to SISR, improving training speed and performance. Our experimental results demonstrate that LKDN outperforms existing lightweight SR methods and achieves SOTA performance.
Submitted 19 July, 2024; originally announced July 2024.
Comments: Accepted to CVPR workshop 2023
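
Structural reparameterization, which the abstract credits for cost-free gains, merges train-time parallel branches into a single convolution at inference; a self-contained sketch of the standard 3x3 + 1x1 merge (a generic RepVGG-style illustration, not LKDN's exact blocks):

```python
# Fold a parallel 1x1 conv into the center tap of a 3x3 conv so that one
# merged 3x3 conv reproduces the two-branch output exactly.
import torch
import torch.nn as nn

conv3 = nn.Conv2d(8, 8, 3, padding=1)
conv1 = nn.Conv2d(8, 8, 1)

merged = nn.Conv2d(8, 8, 3, padding=1)
w = conv3.weight.detach().clone()
w[:, :, 1:2, 1:2] += conv1.weight.detach()      # 1x1 kernel -> center tap
merged.weight.data = w
merged.bias.data = (conv3.bias + conv1.bias).detach()

x = torch.randn(1, 8, 16, 16)
with torch.no_grad():
    assert torch.allclose(conv3(x) + conv1(x), merged(x), atol=1e-5)
print("two branches merged into one conv, outputs identical")
```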

arXiv:2407.13782 (https://arxiv.org/abs/2407.13782) [pdf, other] eess.AS cs.AI cs.SD
Self-supervised ASR Models and Features For Dysarthric and Elderly Speech Recognition
Authors: Shujie Hu, Xurong Xie, Mengzhe Geng, Zengrui Jin, Jiajun Deng, Guinan Li, Yi Wang, Mingyu Cui, Tianzi Wang, Helen Meng, Xunying Liu
Abstract: Self-supervised learning (SSL) based speech foundation models have been applied to a wide range of ASR tasks. However, their application to dysarthric and elderly speech via data-intensive parameter fine-tuning is confronted by in-domain data scarcity and mismatch. To this end, this paper explores a series of approaches to integrate domain fine-tuned SSL pre-trained models and their features into TDNN and Conformer ASR systems for dysarthric and elderly speech recognition. These include: a) input feature fusion between standard acoustic frontends and domain fine-tuned SSL speech representations; b) frame-level joint decoding between TDNN systems separately trained using standard acoustic features alone and those with additional domain fine-tuned SSL features; and c) multi-pass decoding in which TDNN/Conformer system outputs are rescored using domain fine-tuned pre-trained ASR models. In addition, fine-tuned SSL speech features are used in acoustic-to-articulatory (A2A) inversion to construct multi-modal ASR systems. Experiments are conducted on four tasks: the English UASpeech and TORGO dysarthric speech corpora, and the English DementiaBank Pitt and Cantonese JCCOCC MoCA elderly speech datasets. TDNN systems constructed by integrating domain-adapted HuBERT, wav2vec2-conformer, or multi-lingual XLSR models and their features consistently outperform the standalone fine-tuned SSL pre-trained models, producing statistically significant WER or CER reductions of 6.53%, 1.90%, 2.04%, and 7.97% absolute (24.10%, 23.84%, 10.14%, and 31.39% relative) on the four tasks, respectively. Consistent improvements in Alzheimer's disease detection accuracy are also obtained using the DementiaBank Pitt elderly speech recognition outputs.
Submitted 3 July, 2024; originally announced July 2024.
Comments: IEEE/ACM Transactions on Audio, Speech, and Language Processing
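
Approach a), input feature fusion, amounts to aligning SSL features to the filterbank frame rate and concatenating per frame; a minimal sketch with hypothetical shapes (the frame rates and dimensions are assumptions, not the paper's exact pipeline):

```python
# Frame-level fusion of a standard acoustic frontend with SSL features:
# upsample the lower-rate SSL features by nearest frame, then concatenate.
import numpy as np

fbank = np.random.rand(200, 80)   # 200 frames of 80-dim log-Mel features
ssl = np.random.rand(100, 768)    # 100 frames of SSL representations

idx = np.minimum((np.arange(200) * len(ssl)) // 200, len(ssl) - 1)
fused = np.concatenate([fbank, ssl[idx]], axis=1)
print(fused.shape)                # (200, 848) fused per-frame frontend
```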

arXiv:2407.13509 (https://arxiv.org/abs/2407.13509) [pdf, other] cs.SD cs.CL cs.LG eess.AS
Spontaneous Style Text-to-Speech Synthesis with Controllable Spontaneous Behaviors Based on Language Models
Authors: Weiqin Li, Peiji Yang, Yicheng Zhong, Yixuan Zhou, Zhisheng Wang, Zhiyong Wu, Xixin Wu, Helen Meng
Abstract: Spontaneous-style speech synthesis, which aims to generate human-like speech, often encounters challenges due to the scarcity of high-quality data and limitations in model capabilities. Recent language model-based TTS systems can be trained on large, diverse, and low-quality speech datasets, resulting in highly natural synthesized speech. However, they are limited by the difficulty of simulating various spontaneous behaviors and capturing prosody variations in spontaneous speech. In this paper, we propose a novel spontaneous speech synthesis system based on language models. We systematically categorize and uniformly model diverse spontaneous behaviors, and we introduce fine-grained prosody modeling to enhance the model's ability to capture subtle prosody variations in spontaneous speech. Experimental results show that our proposed method significantly outperforms the baseline methods in both prosody naturalness and spontaneous behavior naturalness.
Submitted 18 July, 2024; originally announced July 2024.
Comments: Accepted by INTERSPEECH 2024

arXiv:2407.10376 (https://arxiv.org/abs/2407.10376) [pdf, other] q-bio.NC cs.CL
Large Language Model-based FMRI Encoding of Language Functions for Subjects with Neurocognitive Disorder
Authors: Yuejiao Wang, Xianmin Gong, Lingwei Meng, Xixin Wu, Helen Meng
Abstract: Functional magnetic resonance imaging (fMRI) is essential for developing encoding models that identify functional changes in language-related brain areas of individuals with neurocognitive disorders (NCD). While large language model (LLM)-based fMRI encoding has shown promise, existing studies predominantly focus on healthy, young adults, overlooking older NCD populations and correlations with cognitive level. This paper explores language-related functional changes in older adults with NCD using LLM-based fMRI encoding and brain scores, addressing these limitations. We analyze the correlation between brain scores and cognitive scores at both the whole-brain and language-related ROI levels. Our findings reveal that higher cognitive abilities correspond to better brain scores, with the correlation peaking in the middle temporal gyrus. This study highlights the potential of fMRI encoding models and brain scores for detecting early functional changes in NCD patients.
Submitted 14 July, 2024; originally announced July 2024.
Comments: 5 pages, accepted by Interspeech 2024
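
The encoding-model-plus-brain-score methodology is standard enough to sketch: regress voxel responses onto LLM embeddings of the stimulus, then score held-out predictions by per-voxel correlation. Synthetic data below; the dimensions, ridge penalty, and scoring convention are assumptions, not the paper's settings:

```python
# LLM-based fMRI encoding: ridge regression from stimulus embeddings to
# voxel responses; "brain score" = mean held-out prediction correlation.
import numpy as np
from sklearn.linear_model import Ridge

rng = np.random.default_rng(2)
emb = rng.normal(size=(400, 256))   # LLM embeddings, one row per fMRI frame
bold = emb @ rng.normal(size=(256, 50)) * 0.1 + rng.normal(size=(400, 50))

train, test = slice(0, 300), slice(300, 400)
pred = Ridge(alpha=10.0).fit(emb[train], bold[train]).predict(emb[test])

def col_corr(a, b):                 # per-voxel Pearson correlation
    a, b = a - a.mean(0), b - b.mean(0)
    return (a * b).sum(0) / np.sqrt((a ** 2).sum(0) * (b ** 2).sum(0))

print("brain score:", round(float(col_corr(pred, bold[test]).mean()), 3))
```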

arXiv:2407.09817 (https://arxiv.org/abs/2407.09817) [pdf, other] cs.SD cs.CL eess.AS
Empowering Whisper as a Joint Multi-Talker and Target-Talker Speech Recognition System
Authors: Lingwei Meng, Jiawen Kang, Yuejiao Wang, Zengrui Jin, Xixin Wu, Xunying Liu, Helen Meng
Abstract: Multi-talker speech recognition and target-talker speech recognition, which both involve transcription in multi-talker contexts, remain significant challenges, and existing methods rarely attempt to address both tasks simultaneously. In this study, we propose a pioneering approach to empower Whisper, a speech foundation model, to tackle joint multi-talker and target-talker speech recognition. Specifically, (i) we freeze Whisper and plug a Sidecar separator into its encoder to separate mixed embeddings for multiple talkers; (ii) a Target Talker Identifier is introduced to identify the embedding flow of the target talker on the fly, requiring only three seconds of enrollment speech as a cue; and (iii) soft prompt tuning for the decoder is explored for better task adaptation. Our method outperforms previous methods on the two- and three-talker LibriMix and LibriSpeechMix datasets for both tasks, and delivers acceptable zero-shot multi-talker ASR performance on the AishellMix Mandarin dataset.
Submitted 24 August, 2024; v1 submitted 13 July, 2024; originally announced July 2024.
Comments: Accepted to INTERSPEECH 2024
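
Step (i) is an instance of the freeze-and-plug pattern: the foundation model's weights stay fixed while a small module trained on its hidden states does the new task. A toy sketch of the pattern only (a GRU and a 1-D conv stand in for Whisper's encoder and the Sidecar separator):

```python
# Freeze a backbone, train only a plugged-in module that splits its hidden
# states into per-talker embedding streams.
import torch
import torch.nn as nn

encoder = nn.GRU(80, 128, batch_first=True)      # stand-in frozen encoder
for p in encoder.parameters():
    p.requires_grad = False                      # backbone stays fixed

sidecar = nn.Conv1d(128, 2 * 128, 3, padding=1)  # trainable separator

x = torch.randn(4, 100, 80)                      # (batch, frames, features)
h, _ = encoder(x)
streams = sidecar(h.transpose(1, 2)).transpose(1, 2).chunk(2, dim=-1)
print(len(streams), streams[0].shape)            # two talker streams

opt = torch.optim.Adam(sidecar.parameters(), lr=1e-4)  # only the adapter updates
```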
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">IEEE/ACM Transactions on Audio, Speech, and Language Processing</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.13509">arXiv:2407.13509</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2407.13509">pdf</a>, <a href="https://arxiv.org/format/2407.13509">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Spontaneous Style Text-to-Speech Synthesis with Controllable Spontaneous Behaviors Based on Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Li%2C+W">Weiqin Li</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+P">Peiji Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhong%2C+Y">Yicheng Zhong</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+Y">Yixuan Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Z">Zhisheng Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+Z">Zhiyong Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+X">Xixin Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Meng%2C+H">Helen Meng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.13509v1-abstract-short" style="display: inline;"> Spontaneous style speech synthesis, which aims to generate human-like speech, often encounters challenges due to the scarcity of high-quality data and limitations in model capabilities. Recent language model-based TTS systems can be trained on large, diverse, and low-quality speech datasets, resulting in highly natural synthesized speech. However, they are limited by the difficulty of simulating v&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.13509v1-abstract-full').style.display = 'inline'; document.getElementById('2407.13509v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.13509v1-abstract-full" style="display: none;"> Spontaneous style speech synthesis, which aims to generate human-like speech, often encounters challenges due to the scarcity of high-quality data and limitations in model capabilities. Recent language model-based TTS systems can be trained on large, diverse, and low-quality speech datasets, resulting in highly natural synthesized speech. However, they are limited by the difficulty of simulating various spontaneous behaviors and capturing prosody variations in spontaneous speech. In this paper, we propose a novel spontaneous speech synthesis system based on language models. We systematically categorize and uniformly model diverse spontaneous behaviors. 

arXiv:2407.07810 (https://arxiv.org/abs/2407.07810) [pdf, other] cs.LG cs.AI cs.CL
Transformer Block Coupling and its Correlation with Generalization in LLMs
Authors: Murdock Aubry, Haoming Meng, Anton Sugolov, Vardan Papyan
Abstract: Large Language Models (LLMs) have made significant strides in natural language processing, and a precise understanding of the internal mechanisms driving their success is essential. In this work, we trace the trajectories of individual tokens as they pass through transformer blocks, and linearize the system along these trajectories through their Jacobian matrices. By examining the relationships between these Jacobians, we uncover a transformer block coupling phenomenon in a variety of LLMs, characterized by the coupling of their top singular vectors across tokens and depth. Our findings reveal that coupling positively correlates with model performance, and that this relationship is stronger than with other hyperparameters such as parameter budget, model depth, and embedding dimension. We further investigate the emergence of these properties through training, noting the development of coupling as well as an increase in linearity and layer-wise exponential growth in the token trajectories. These collective insights provide a novel perspective on the interactions between token embeddings and prompt further approaches to studying training and generalization in LLMs.
Submitted 22 December, 2024; v1 submitted 10 July, 2024; originally announced July 2024.
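
The measurement itself is easy to reproduce in miniature: take the Jacobian of each block at a token's hidden state and compare top singular directions across depth. A sketch with toy MLP blocks standing in for transformer blocks (the coupling statistic shown is a simplified proxy):

```python
# Linearize each block along a token's trajectory via its Jacobian, then
# check alignment of top singular vectors across depth ("coupling").
import torch

torch.manual_seed(0)
blocks = [torch.nn.Sequential(torch.nn.Linear(32, 32), torch.nn.Tanh())
          for _ in range(3)]

h = torch.randn(32)
tops = []
for blk in blocks:
    J = torch.autograd.functional.jacobian(blk, h)  # (32, 32) linearization
    U, S, Vh = torch.linalg.svd(J)
    tops.append(U[:, 0])                            # top left singular vector
    h = blk(h)                                      # continue the trajectory

for i in range(2):
    print(f"|<u{i}, u{i + 1}>| =", float(tops[i].dot(tops[i + 1]).abs()))
```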

arXiv:2407.07518 (https://arxiv.org/abs/2407.07518) [pdf, other] cs.CV
Multi-modal Crowd Counting via a Broker Modality
Authors: Haoliang Meng, Xiaopeng Hong, Chenhao Wang, Miao Shang, Wangmeng Zuo
Abstract: Multi-modal crowd counting involves estimating crowd density from both visual and thermal/depth images. This task is challenging due to the significant gap between these distinct modalities. In this paper, we propose a novel approach that introduces an auxiliary broker modality, framing the task as a triple-modal learning problem. We devise a fusion-based method to generate this broker modality, leveraging a non-diffusion, lightweight counterpart of modern denoising-diffusion-based fusion models. Additionally, we identify and address the ghosting effect caused by direct cross-modal image fusion in multi-modal crowd counting. Through extensive experiments on popular multi-modal crowd-counting datasets, we demonstrate the effectiveness of our method, which introduces only 4 million additional parameters yet achieves promising results. The code is available at https://github.com/HenryCilence/Broker-Modality-Crowd-Counting.
Submitted 10 July, 2024; originally announced July 2024.
Comments: This is the preprint version of the paper and supplemental material to appear in ECCV 2024. Please cite the final published version. Code is available at https://github.com/HenryCilence/Broker-Modality-Crowd-Counting

arXiv:2407.06310 (https://arxiv.org/abs/2407.06310) [pdf, other] cs.SD cs.AI cs.HC cs.LG eess.AS
Homogeneous Speaker Features for On-the-Fly Dysarthric and Elderly Speaker Adaptation
Authors: Mengzhe Geng, Xurong Xie, Jiajun Deng, Zengrui Jin, Guinan Li, Tianzi Wang, Shujie Hu, Zhaoqing Li, Helen Meng, Xunying Liu
Abstract: The application of data-intensive automatic speech recognition (ASR) technologies to dysarthric and elderly adult speech is confronted by their mismatch against healthy and non-aged voices, data scarcity, and large speaker-level variability. To this end, this paper proposes two novel data-efficient methods to learn homogeneous dysarthric and elderly speaker-level features for rapid, on-the-fly test-time adaptation of DNN/TDNN and Conformer ASR models. These include: 1) speaker-level variance-regularized spectral basis embedding (VR-SBE) features that exploit a special regularization term to enforce homogeneity of speaker features during adaptation; and 2) feature-based learning hidden unit contributions (f-LHUC) transforms that are conditioned on VR-SBE features. Experiments are conducted on four tasks across two languages: the English UASpeech and TORGO dysarthric speech datasets, and the English DementiaBank Pitt and Cantonese JCCOCC MoCA elderly speech corpora. The proposed on-the-fly speaker adaptation techniques consistently outperform baseline iVector and xVector adaptation by statistically significant word or character error rate reductions of up to 5.32% absolute (18.57% relative), and batch-mode LHUC speaker adaptation by 2.24% absolute (9.20% relative), while operating up to 33.6 times faster in real-time factor than xVectors during adaptation. The efficacy of the proposed techniques is demonstrated in a comparison against current ASR technologies, including SSL pre-trained systems, on UASpeech, where our best system produces a state-of-the-art WER of 23.33%. Analyses show that VR-SBE features and f-LHUC transforms are insensitive to speaker-level data quantity in test-time adaptation, and t-SNE visualization reveals they have stronger speaker-level homogeneity than baseline iVectors, xVectors, and batch-mode LHUC transforms.
Submitted 8 July, 2024; originally announced July 2024.
Comments: In submission to IEEE/ACM Transactions on Audio, Speech, and Language Processing
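
The "variance-regularized" idea admits a simple generic reading: penalize the within-speaker variance of frame-level features so each speaker maps to a consistent embedding. A hedged sketch of that reading only (not the paper's VR-SBE formulation, whose regularizer acts on spectral basis embeddings):

```python
# Homogeneity via variance regularization: add the within-speaker variance
# of frame-level embeddings to the training loss.
import torch

emb = torch.randn(3, 50, 16, requires_grad=True)  # (speakers, frames, dims)
task_loss = emb.pow(2).mean()                     # placeholder ASR-style loss

within_var = emb.var(dim=1).mean()                # variance across frames
loss = task_loss + 0.1 * within_var               # weight is illustrative
loss.backward()
print(float(within_var))
```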
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to INTERSPEECH 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.08551">arXiv:2407.08551</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2407.08551">pdf</a>, <a href="https://arxiv.org/format/2407.08551">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Autoregressive Speech Synthesis without Vector Quantization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Meng%2C+L">Lingwei Meng</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+L">Long Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+S">Shujie Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+S">Sanyuan Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Han%2C+B">Bing Han</a>, <a href="/search/cs?searchtype=author&amp;query=Hu%2C+S">Shujie Hu</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Y">Yanqing Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+J">Jinyu Li</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+S">Sheng Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+X">Xixin Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Meng%2C+H">Helen Meng</a>, <a href="/search/cs?searchtype=author&amp;query=Wei%2C+F">Furu Wei</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.08551v1-abstract-short" style="display: inline;"> We present MELLE, a novel continuous-valued tokens based language modeling approach for text to speech synthesis (TTS). MELLE autoregressively generates continuous mel-spectrogram frames directly from text condition, bypassing the need for vector quantization, which are originally designed for audio compression and sacrifice fidelity compared to mel-spectrograms. Specifically, (i) instead of cross&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.08551v1-abstract-full').style.display = 'inline'; document.getElementById('2407.08551v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.08551v1-abstract-full" style="display: none;"> We present MELLE, a novel continuous-valued tokens based language modeling approach for text to speech synthesis (TTS). MELLE autoregressively generates continuous mel-spectrogram frames directly from text condition, bypassing the need for vector quantization, which are originally designed for audio compression and sacrifice fidelity compared to mel-spectrograms. Specifically, (i) instead of cross-entropy loss, we apply regression loss with a proposed spectrogram flux loss function to model the probability distribution of the continuous-valued tokens. 
(ii) we have incorporated variational inference into MELLE to facilitate sampling mechanisms, thereby enhancing the output diversity and model robustness. Experiments demonstrate that, compared to the two-stage codec language models VALL-E and its variants, the single-stage MELLE mitigates robustness issues by avoiding the inherent flaws of sampling discrete codes, achieves superior performance across multiple metrics, and, most importantly, offers a more streamlined paradigm. See https://aka.ms/melle for demos of our work. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.08551v1-abstract-full').style.display = 'none'; document.getElementById('2407.08551v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.07810">arXiv:2407.07810</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2407.07810">pdf</a>, <a href="https://arxiv.org/format/2407.07810">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Transformer Block Coupling and its Correlation with Generalization in LLMs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Aubry%2C+M">Murdock Aubry</a>, <a href="/search/cs?searchtype=author&amp;query=Meng%2C+H">Haoming Meng</a>, <a href="/search/cs?searchtype=author&amp;query=Sugolov%2C+A">Anton Sugolov</a>, <a href="/search/cs?searchtype=author&amp;query=Papyan%2C+V">Vardan Papyan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.07810v3-abstract-short" style="display: inline;"> Large Language Models (LLMs) have made significant strides in natural language processing, and a precise understanding of the internal mechanisms driving their success is essential. In this work, we trace the trajectories of individual tokens as they pass through transformer blocks, and linearize the system along these trajectories through their Jacobian matrices. By examining the relationships be&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.07810v3-abstract-full').style.display = 'inline'; document.getElementById('2407.07810v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.07810v3-abstract-full" style="display: none;"> Large Language Models (LLMs) have made significant strides in natural language processing, and a precise understanding of the internal mechanisms driving their success is essential. In this work, we trace the trajectories of individual tokens as they pass through transformer blocks, and linearize the system along these trajectories through their Jacobian matrices. 
By examining the relationships between these Jacobians, we uncover a $\textbf{transformer block coupling}$ phenomenon in a variety of LLMs, characterized by the coupling of their top singular vectors across tokens and depth. Our findings reveal that coupling $\textit{positively correlates}$ with model performance, and that this relationship is stronger than with other hyperparameters, namely parameter budget, model depth, and embedding dimension. We further investigate the emergence of these properties through training, noting the development of coupling, as well as an increase in linearity and layer-wise exponential growth in the token trajectories. These collective insights provide a novel perspective on the interactions between token embeddings, and prompt further approaches to study training and generalization in LLMs. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.07810v3-abstract-full').style.display = 'none'; document.getElementById('2407.07810v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 10 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.07518">arXiv:2407.07518</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2407.07518">pdf</a>, <a href="https://arxiv.org/format/2407.07518">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Multi-modal Crowd Counting via a Broker Modality </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Meng%2C+H">Haoliang Meng</a>, <a href="/search/cs?searchtype=author&amp;query=Hong%2C+X">Xiaopeng Hong</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+C">Chenhao Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Shang%2C+M">Miao Shang</a>, <a href="/search/cs?searchtype=author&amp;query=Zuo%2C+W">Wangmeng Zuo</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.07518v1-abstract-short" style="display: inline;"> Multi-modal crowd counting involves estimating crowd density from both visual and thermal/depth images. This task is challenging due to the significant gap between these distinct modalities. In this paper, we propose a novel approach by introducing an auxiliary broker modality and on this basis frame the task as a triple-modal learning problem. We devise a fusion-based method to generate this brok&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.07518v1-abstract-full').style.display = 'inline'; document.getElementById('2407.07518v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.07518v1-abstract-full" style="display: none;"> Multi-modal crowd counting involves estimating crowd density from both visual and thermal/depth images. 
This task is challenging due to the significant gap between these distinct modalities. In this paper, we propose a novel approach by introducing an auxiliary broker modality and on this basis frame the task as a triple-modal learning problem. We devise a fusion-based method to generate this broker modality, leveraging a non-diffusion, lightweight counterpart of modern denoising diffusion-based fusion models. Additionally, we identify and address the ghosting effect caused by direct cross-modal image fusion in multi-modal crowd counting. Through extensive experimental evaluations on popular multi-modal crowd-counting datasets, we demonstrate the effectiveness of our method, which introduces only 4 million additional parameters, yet achieves promising results. The code is available at https://github.com/HenryCilence/Broker-Modality-Crowd-Counting. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.07518v1-abstract-full').style.display = 'none'; document.getElementById('2407.07518v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">This is the preprint version of the paper and supplemental material to appear in ECCV 2024. Please cite the final published version. Code is available at https://github.com/HenryCilence/Broker-Modality-Crowd-Counting</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.06310">arXiv:2407.06310</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2407.06310">pdf</a>, <a href="https://arxiv.org/format/2407.06310">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Homogeneous Speaker Features for On-the-Fly Dysarthric and Elderly Speaker Adaptation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Geng%2C+M">Mengzhe Geng</a>, <a href="/search/cs?searchtype=author&amp;query=Xie%2C+X">Xurong Xie</a>, <a href="/search/cs?searchtype=author&amp;query=Deng%2C+J">Jiajun Deng</a>, <a href="/search/cs?searchtype=author&amp;query=Jin%2C+Z">Zengrui Jin</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+G">Guinan Li</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+T">Tianzi Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Hu%2C+S">Shujie Hu</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Z">Zhaoqing Li</a>, <a href="/search/cs?searchtype=author&amp;query=Meng%2C+H">Helen Meng</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+X">Xunying Liu</a> </p> <p class="abstract mathjax"> <span 
class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.06310v1-abstract-short" style="display: inline;"> The application of data-intensive automatic speech recognition (ASR) technologies to dysarthric and elderly adult speech is confronted by their mismatch against healthy and nonaged voices, data scarcity and large speaker-level variability. To this end, this paper proposes two novel data-efficient methods to learn homogeneous dysarthric and elderly speaker-level features for rapid, on-the-fly test-&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.06310v1-abstract-full').style.display = 'inline'; document.getElementById('2407.06310v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.06310v1-abstract-full" style="display: none;"> The application of data-intensive automatic speech recognition (ASR) technologies to dysarthric and elderly adult speech is confronted by their mismatch against healthy and nonaged voices, data scarcity and large speaker-level variability. To this end, this paper proposes two novel data-efficient methods to learn homogeneous dysarthric and elderly speaker-level features for rapid, on-the-fly test-time adaptation of DNN/TDNN and Conformer ASR models. These include: 1) speaker-level variance-regularized spectral basis embedding (VR-SBE) features that exploit a special regularization term to enforce homogeneity of speaker features in adaptation; and 2) feature-based learning hidden unit contributions (f-LHUC) transforms that are conditioned on VR-SBE features. Experiments are conducted on four tasks across two languages: the English UASpeech and TORGO dysarthric speech datasets, the English DementiaBank Pitt and Cantonese JCCOCC MoCA elderly speech corpora. The proposed on-the-fly speaker adaptation techniques consistently outperform baseline iVector and xVector adaptation by statistically significant word or character error rate reductions up to 5.32% absolute (18.57% relative) and batch-mode LHUC speaker adaptation by 2.24% absolute (9.20% relative), while operating with real-time factors speeding up to 33.6 times against xVectors during adaptation. The efficacy of the proposed adaptation techniques is demonstrated in a comparison against current ASR technologies including SSL pre-trained systems on UASpeech, where our best system produces a state-of-the-art WER of 23.33%. Analyses show VR-SBE features and f-LHUC transforms are insensitive to speaker-level data quantity in testtime adaptation. T-SNE visualization reveals they have stronger speaker-level homogeneity than baseline iVectors, xVectors and batch-mode LHUC transforms. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.06310v1-abstract-full').style.display = 'none'; document.getElementById('2407.06310v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">In submission to IEEE/ACM Transactions on Audio, Speech, and Language Processing</span> </p> </li> </ol> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&amp;query=Meng%2C+H&amp;start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&amp;query=Meng%2C+H&amp;start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Meng%2C+H&amp;start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Meng%2C+H&amp;start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Meng%2C+H&amp;start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Meng%2C+H&amp;start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Meng%2C+H&amp;start=250" class="pagination-link " aria-label="Page 6" aria-current="page">6 </a> </li> </ul> </nav> <div class="is-hidden-tablet"> <!-- feedback for mobile only --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>&nbsp;&nbsp;</span> </div> </div> </main> <footer> <div class="columns is-desktop" role="navigation" aria-label="Secondary"> <!-- MetaColumn 1 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div> <!-- end MetaColumn 1 --> <!-- MetaColumn 2 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a 
href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div> <!-- end MetaColumn 2 --> </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>
