Search | arXiv e-print repository

<a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1&ndash;34 of 34 results for author: <span class="mathjax">Hsu, P</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>&nbsp;&nbsp;</span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&amp;query=Hsu%2C+P">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." 
type="text" value="Hsu, P"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Hsu%2C+P&amp;terms-0-field=author&amp;size=50&amp;order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Hsu, P"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. 
</div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.10548">arXiv:2411.10548</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.10548">pdf</a>, <a href="https://arxiv.org/ps/2411.10548">ps</a>, <a href="https://arxiv.org/format/2411.10548">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Biomolecules">q-bio.BM</span> </div> </div> <p class="title is-5 mathjax"> BioNeMo Framework: a modular, high-performance library for AI model development in drug discovery </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=John%2C+P+S">Peter St. John</a>, <a href="/search/cs?searchtype=author&amp;query=Lin%2C+D">Dejun Lin</a>, <a href="/search/cs?searchtype=author&amp;query=Binder%2C+P">Polina Binder</a>, <a href="/search/cs?searchtype=author&amp;query=Greaves%2C+M">Malcolm Greaves</a>, <a href="/search/cs?searchtype=author&amp;query=Shah%2C+V">Vega Shah</a>, <a href="/search/cs?searchtype=author&amp;query=John%2C+J+S">John St. John</a>, <a href="/search/cs?searchtype=author&amp;query=Lange%2C+A">Adrian Lange</a>, <a href="/search/cs?searchtype=author&amp;query=Hsu%2C+P">Patrick Hsu</a>, <a href="/search/cs?searchtype=author&amp;query=Illango%2C+R">Rajesh Illango</a>, <a href="/search/cs?searchtype=author&amp;query=Ramanathan%2C+A">Arvind Ramanathan</a>, <a href="/search/cs?searchtype=author&amp;query=Anandkumar%2C+A">Anima Anandkumar</a>, <a href="/search/cs?searchtype=author&amp;query=Brookes%2C+D+H">David H Brookes</a>, <a href="/search/cs?searchtype=author&amp;query=Busia%2C+A">Akosua Busia</a>, <a href="/search/cs?searchtype=author&amp;query=Mahajan%2C+A">Abhishaike Mahajan</a>, <a href="/search/cs?searchtype=author&amp;query=Malina%2C+S">Stephen Malina</a>, <a href="/search/cs?searchtype=author&amp;query=Prasad%2C+N">Neha Prasad</a>, <a href="/search/cs?searchtype=author&amp;query=Sinai%2C+S">Sam Sinai</a>, <a href="/search/cs?searchtype=author&amp;query=Edwards%2C+L">Lindsay Edwards</a>, <a href="/search/cs?searchtype=author&amp;query=Gaudelet%2C+T">Thomas Gaudelet</a>, <a href="/search/cs?searchtype=author&amp;query=Regep%2C+C">Cristian Regep</a>, <a href="/search/cs?searchtype=author&amp;query=Steinegger%2C+M">Martin Steinegger</a>, <a href="/search/cs?searchtype=author&amp;query=Rost%2C+B">Burkhard Rost</a>, <a href="/search/cs?searchtype=author&amp;query=Brace%2C+A">Alexander Brace</a>, <a href="/search/cs?searchtype=author&amp;query=Hippe%2C+K">Kyle Hippe</a>, <a href="/search/cs?searchtype=author&amp;query=Naef%2C+L">Luca Naef</a> , et al. 
(63 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.10548v1-abstract-short" style="display: inline;"> Artificial Intelligence models encoding biology and chemistry are opening new routes to high-throughput and high-quality in-silico drug development. However, their training increasingly relies on computational scale, with recent protein language models (pLM) training on hundreds of graphical processing units (GPUs). We introduce the BioNeMo Framework to facilitate the training of computational bio&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10548v1-abstract-full').style.display = 'inline'; document.getElementById('2411.10548v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.10548v1-abstract-full" style="display: none;"> Artificial Intelligence models encoding biology and chemistry are opening new routes to high-throughput and high-quality in-silico drug development. However, their training increasingly relies on computational scale, with recent protein language models (pLM) training on hundreds of graphical processing units (GPUs). We introduce the BioNeMo Framework to facilitate the training of computational biology and chemistry AI models across hundreds of GPUs. Its modular design allows the integration of individual components, such as data loaders, into existing workflows and is open to community contributions. We detail technical features of the BioNeMo Framework through use cases such as pLM pre-training and fine-tuning. On 256 NVIDIA A100s, BioNeMo Framework trains a three billion parameter BERT-based pLM on over one trillion tokens in 4.2 days. The BioNeMo Framework is open-source and free for everyone to use. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10548v1-abstract-full').style.display = 'none'; document.getElementById('2411.10548v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 

2. arXiv:2411.07111 [pdf, other]  cs.CL; cs.SD; eess.AS
   Building a Taiwanese Mandarin Spoken Language Model: A First Attempt
   Authors: Chih-Kai Yang, Yu-Kuan Fu, Chen-An Li, Yi-Cheng Lin, Yu-Xiang Lin, Wei-Chih Chen, Ho Lam Chung, Chun-Yi Kuan, Wei-Ping Huang, Ke-Han Lu, Tzu-Quan Lin, Hsiu-Hsuan Wang, En-Pei Hu, Chan-Jan Hsu, Liang-Hsuan Tseng, I-Hsiang Chiu, Ulin Sanga, Xuanjun Chen, Po-chun Hsu, Shu-wen Yang, Hung-yi Lee
   Abstract: This technical report presents our initial attempt to build a spoken large language model (LLM) for Taiwanese Mandarin, specifically tailored to enable real-time, speech-to-speech interaction in multi-turn conversations. Our end-to-end model incorporates a decoder-only transformer architecture and aims to achieve seamless interaction while preserving the conversational flow, including full-duplex capabilities allowing simultaneous speaking and listening. The paper also details the training process, including data preparation with synthesized dialogues and adjustments for real-time interaction. We also developed a platform to evaluate conversational fluency and response coherence in multi-turn dialogues. We hope the release of the report can contribute to the future development of spoken LLMs in Taiwanese Mandarin.
   Submitted 11 November, 2024; originally announced November 2024.
   Comments: Work in progress

3. arXiv:2410.10989 [pdf, other]  cs.LG; cs.AI; cs.CL; cs.DC
   Liger Kernel: Efficient Triton Kernels for LLM Training
   Authors: Pin-Lun Hsu, Yun Dai, Vignesh Kothapalli, Qingquan Song, Shao Tang, Siyu Zhu, Steven Shimizu, Shivam Sahni, Haowen Ning, Yanning Chen
   Abstract: Training Large Language Models (LLMs) efficiently at scale presents a formidable challenge, driven by their ever-increasing computational demands and the need for enhanced performance. In this work, we introduce Liger-Kernel, an open-sourced set of Triton kernels developed specifically for LLM training. With kernel optimization techniques like kernel operation fusing and input chunking, our kernels achieve on average a 20% increase in training throughput and a 60% reduction in GPU memory usage for popular LLMs compared to HuggingFace implementations. In addition, Liger-Kernel is designed with modularity, accessibility, and adaptability in mind, catering to both casual and expert users. Comprehensive benchmarks and integration tests are built in to ensure compatibility, performance, correctness, and convergence across diverse computing environments and model architectures. The source code is available under a permissive license at: github.com/linkedin/Liger-Kernel.
   Submitted 18 October, 2024; v1 submitted 14 October, 2024; originally announced October 2024.
   Comments: 17 pages, 12 figures
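
Entry 3 credits its memory savings mainly to kernel fusion and input chunking. As a rough illustration of the chunking idea only (not the Liger-Kernel Triton implementation; shapes, names, and the chunk size below are made-up values), the sketch computes a language-model cross-entropy loss over slices of the token dimension so that only one chunk-sized logits tensor is built per loop iteration:

```python
import torch
import torch.nn.functional as F

def chunked_lm_loss(hidden, lm_head_weight, labels, chunk_size=1024):
    """Toy sketch of input chunking: compute the LM cross-entropy loss chunk
    by chunk, so a full (tokens x vocab) logits tensor is never created in one
    shot. Fused-kernel libraries additionally fuse the backward pass; this
    plain-PyTorch sketch ignores that and only shows the chunked forward."""
    total_loss = 0.0
    total_tokens = 0
    for start in range(0, hidden.shape[0], chunk_size):
        h = hidden[start:start + chunk_size]       # (chunk, hidden_dim)
        y = labels[start:start + chunk_size]       # (chunk,)
        logits = h @ lm_head_weight.t()            # (chunk, vocab): one chunk-sized logits tensor per iteration
        total_loss = total_loss + F.cross_entropy(logits, y, reduction="sum")
        total_tokens += y.numel()
    return total_loss / total_tokens

# Hypothetical shapes for illustration only.
hidden = torch.randn(8192, 4096)
weight = torch.randn(32000, 4096)
labels = torch.randint(0, 32000, (8192,))
print(chunked_lm_loss(hidden, weight, labels))
```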

4. arXiv:2409.12558 [pdf, other]  cs.CL
   RAD-Bench: Evaluating Large Language Models Capabilities in Retrieval Augmented Dialogues
   Authors: Tzu-Lin Kuo, Feng-Ting Liao, Mu-Wei Hsieh, Fu-Chieh Chang, Po-Chun Hsu, Da-Shan Shiu
   Abstract: In real-world applications with Large Language Models (LLMs), external retrieval mechanisms - such as Search-Augmented Generation (SAG), tool utilization, and Retrieval-Augmented Generation (RAG) - are often employed to enhance the quality of augmented generations in dialogues. These approaches often come with multi-turn dialogue, where each interaction is enriched by relevant information retrieved from external sources. Existing benchmarks either assess LLMs' chat abilities in multi-turn dialogues or their use of retrieval for augmented responses in single-turn settings. However, there is a gap in evaluating LLMs' ability to leverage retrieval for more precise responses across multiple turns. To address this limitation, we introduce RAD-Bench (Retrieval Augmented Dialogue), a benchmark designed to evaluate LLMs' capabilities in multi-turn dialogues following retrievals, essential for their deployment in context-rich applications. RAD-Bench evaluates two key abilities of LLMs: Retrieval Synthesis and Retrieval Reasoning. These are measured using discriminative questions and retrieved contexts, and corresponding reference answers, assessing how effectively LLMs integrate and reason with context to maintain and enhance conversation quality over multiple turns. Our evaluation results on commonly used LLMs reveal that model performance deteriorates as additional layers of conditions or constraints are applied across conversation turns, even when accurate retrieved contexts are provided.
   Submitted 19 September, 2024; originally announced September 2024.

5. arXiv:2409.11654 [pdf, other]  q-bio.QM; cs.AI; cs.LG; q-bio.NC
   How to Build the Virtual Cell with Artificial Intelligence: Priorities and Opportunities
   Authors: Charlotte Bunne, Yusuf Roohani, Yanay Rosen, Ankit Gupta, Xikun Zhang, Marcel Roed, Theo Alexandrov, Mohammed AlQuraishi, Patricia Brennan, Daniel B. Burkhardt, Andrea Califano, Jonah Cool, Abby F. Dernburg, Kirsty Ewing, Emily B. Fox, Matthias Haury, Amy E. Herr, Eric Horvitz, Patrick D. Hsu, Viren Jain, Gregory R. Johnson, Thomas Kalil, David R. Kelley, Shana O. Kelley, Anna Kreshuk, et al. (17 additional authors not shown)
   Abstract: The cell is arguably the most fundamental unit of life and is central to understanding biology. Accurate modeling of cells is important for this understanding as well as for determining the root causes of disease. Recent advances in artificial intelligence (AI), combined with the ability to generate large-scale experimental data, present novel opportunities to model cells. Here we propose a vision of leveraging advances in AI to construct virtual cells, high-fidelity simulations of cells and cellular systems under different conditions that are directly learned from biological data across measurements and scales. We discuss desired capabilities of such AI Virtual Cells, including generating universal representations of biological entities across scales, and facilitating interpretable in silico experiments to predict and understand their behavior using virtual instruments. We further address the challenges, opportunities and requirements to realize this vision including data needs, evaluation strategies, and community standards and engagement to ensure biological accuracy and broad utility. We envision a future where AI Virtual Cells help identify new drug targets, predict cellular responses to perturbations, as well as scale hypothesis exploration. With open science collaborations across the biomedical ecosystem that includes academia, philanthropy, and the biopharma and AI industries, a comprehensive predictive understanding of cell mechanisms and interactions has come into reach.
   Submitted 14 October, 2024; v1 submitted 17 September, 2024; originally announced September 2024.

6. arXiv:2409.09090 [pdf, other]  cs.DL; cs.CL
   An Evaluation of GPT-4V for Transcribing the Urban Renewal Hand-Written Collection
   Authors: Myeong Lee, Julia H. P. Hsu
   Abstract: Between 1960 and 1980, urban renewal transformed many cities, creating vast handwritten records. These documents posed a significant challenge for researchers due to their volume and handwritten nature. The launch of GPT-4V in November 2023 offered a breakthrough, enabling large-scale, efficient transcription and analysis of these historical urban renewal documents.
   Submitted 11 September, 2024; originally announced September 2024.
   Comments: Published in Digital Humanities (DH 2024). Aug 6-9. Arlington, VA

7. arXiv:2407.15440 [pdf, other]  cs.AR; cs.PF
   The Bicameral Cache: a split cache for vector architectures
   Authors: Susana Rebolledo, Borja Perez, Jose Luis Bosque, Peter Hsu
   Abstract: The Bicameral Cache is a cache organization proposal for a vector architecture that segregates data according to their access type, distinguishing scalar from vector references. Its aim is to avoid both types of references from interfering in each other's data locality, with a special focus on prioritizing the performance on vector references. The proposed system incorporates an additional, non-polluting prefetching mechanism to help populate the long vector cache lines in advance to increase the hit rate by further exploiting the spatial locality on vector data. Its evaluation was conducted on the Cavatools simulator, comparing the performance to a standard conventional cache, over different typical vector benchmarks for several vector lengths. The results proved the proposed cache speeds up performance on stride-1 vector benchmarks, while hardly impacting non-stride-1's. In addition, the prefetching feature consistently provided an additional value.
   Submitted 24 September, 2024; v1 submitted 22 July, 2024; originally announced July 2024.
   Comments: 9 pages, 5 figures

8. arXiv:2405.14259 [pdf, other]  cs.CL; cs.AI
   Let's Fuse Step by Step: A Generative Fusion Decoding Algorithm with LLMs for Multi-modal Text Recognition
   Authors: Chan-Jan Hsu, Yi-Chang Chen, Feng-Ting Liao, Pei-Chen Ho, Yu-Hsiang Wang, Po-Chun Hsu, Da-shan Shiu
   Abstract: We introduce "Generative Fusion Decoding" (GFD), a novel shallow fusion framework, utilized to integrate Large Language Models (LLMs) into multi-modal text recognition systems such as automatic speech recognition (ASR) and optical character recognition (OCR). We derive the formulas necessary to enable GFD to operate across mismatched token spaces of different models by mapping text token space to byte token space, enabling seamless fusion during the decoding process. The framework is plug-and-play, compatible with various auto-regressive models, and does not require re-training for feature alignment, thus overcoming limitations of previous fusion techniques. We highlight three main advantages of GFD: First, by simplifying the complexity of aligning different model sample spaces, GFD allows LLMs to correct errors in tandem with the recognition model, reducing computation latencies. Second, the in-context learning ability of LLMs is fully capitalized by GFD, increasing robustness in long-form speech recognition and instruction aware speech recognition. Third, GFD enables fusing recognition models deficient in Chinese text recognition with LLMs extensively trained on Chinese. Our evaluation demonstrates that GFD significantly improves performance in ASR and OCR tasks, with ASR reaching state-of-the-art in the NTUML2021 benchmark. GFD provides a significant step forward in model integration, offering a unified solution that could be widely applicable to leveraging existing pre-trained models through step by step fusion.
   Submitted 2 June, 2024; v1 submitted 23 May, 2024; originally announced May 2024.
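
Entry 8 performs shallow fusion across mismatched tokenizers by mapping both models' token spaces onto a shared byte space. A much-simplified sketch of that scoring idea (the uniform per-byte spreading, the interpolation weight, and the function names are assumptions for illustration, not the paper's derivation):

```python
def map_tokens_to_bytes(token_logprobs, tokens):
    """Spread each token's log-probability uniformly over its UTF-8 bytes,
    a simple assumed way to put two different tokenizations on a common axis."""
    per_byte = []
    for lp, tok in zip(token_logprobs, tokens):
        n = len(tok.encode("utf-8"))
        per_byte.extend([lp / n] * n)
    return per_byte

def byte_fusion_score(rec_logprobs, llm_logprobs, lam=0.3):
    """Shallow-fusion score in the shared byte space: recognizer score plus
    lam times the LLM score, byte by byte. lam is an illustrative weight."""
    assert len(rec_logprobs) == len(llm_logprobs)
    return sum(r + lam * g for r, g in zip(rec_logprobs, llm_logprobs))

# Hypothetical candidate "hello", tokenized differently by the two models.
rec = map_tokens_to_bytes([-1.2, -0.7], ["hel", "lo"])   # recognizer (e.g. ASR) tokens
llm = map_tokens_to_bytes([-0.9], ["hello"])             # LLM token
print(byte_fusion_score(rec, llm))
```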

9. arXiv:2404.14135 [pdf, other]  cs.CV
   Text in the Dark: Extremely Low-Light Text Image Enhancement
   Authors: Che-Tsung Lin, Chun Chet Ng, Zhi Qin Tan, Wan Jun Nah, Xinyu Wang, Jie Long Kew, Pohao Hsu, Shang Hong Lai, Chee Seng Chan, Christopher Zach
   Abstract: Extremely low-light text images are common in natural scenes, making scene text detection and recognition challenging. One solution is to enhance these images using low-light image enhancement methods before text extraction. However, previous methods often do not try to particularly address the significance of low-level features, which are crucial for optimal performance on downstream scene text tasks. Further research is also hindered by the lack of extremely low-light text datasets. To address these limitations, we propose a novel encoder-decoder framework with an edge-aware attention module to focus on scene text regions during enhancement. Our proposed method uses novel text detection and edge reconstruction losses to emphasize low-level scene text features, leading to successful text extraction. Additionally, we present a Supervised Deep Curve Estimation (Supervised-DCE) model to synthesize extremely low-light images based on publicly available scene text datasets such as ICDAR15 (IC15). We also labeled texts in the extremely low-light See In the Dark (SID) and ordinary LOw-Light (LOL) datasets to allow for objective assessment of extremely low-light image enhancement through scene text tasks. Extensive experiments show that our model outperforms state-of-the-art methods in terms of both image quality and scene text metrics on the widely-used LOL, SID, and synthetic IC15 datasets. Code and dataset will be released publicly at https://github.com/chunchet-ng/Text-in-the-Dark.
   Submitted 22 April, 2024; originally announced April 2024.
   Comments: The first two authors contributed equally to this work

10. arXiv:2403.02712 [pdf, other]  cs.CL
    Breeze-7B Technical Report
    Authors: Chan-Jan Hsu, Chang-Le Liu, Feng-Ting Liao, Po-Chun Hsu, Yi-Chang Chen, Da-Shan Shiu
    Abstract: Breeze-7B is an open-source language model based on Mistral-7B, designed to address the need for improved language comprehension and chatbot-oriented capabilities in Traditional Chinese. This technical report provides an overview of the additional pretraining, finetuning, and evaluation stages for the Breeze-7B model. The Breeze-7B family of base and chat models exhibits good performance on language comprehension and chatbot-oriented tasks, reaching the top in several benchmarks among models comparable in its complexity class.
    Submitted 3 April, 2024; v1 submitted 5 March, 2024; originally announced March 2024.

11. arXiv:2312.04257 [pdf, other]  cs.AR
    Proxima: Near-storage Acceleration for Graph-based Approximate Nearest Neighbor Search in 3D NAND
    Authors: Weihong Xu, Junwei Chen, Po-Kai Hsu, Jaeyoung Kang, Minxuan Zhou, Sumukh Pinge, Shimeng Yu, Tajana Rosing
    Abstract: Approximate nearest neighbor search (ANNS) plays an indispensable role in a wide variety of applications, including recommendation systems, information retrieval, and semantic search. Among the cutting-edge ANNS algorithms, graph-based approaches provide superior accuracy and scalability on massive datasets. However, the best-performing graph-based ANN search solutions incur tens of hundreds of memory footprints as well as costly distance computation, thus hindering their efficient deployment at scale. The 3D NAND flash is emerging as a promising device for data-intensive applications due to its high density and nonvolatility. In this work, we present the near-storage processing (NSP)-based ANNS solution Proxima, to accelerate graph-based ANNS with algorithm-hardware co-design in 3D NAND flash. Proxima significantly reduces the complexity of graph search by leveraging the distance approximation and early termination. On top of the algorithmic enhancement, we implement Proxima search algorithm in 3D NAND flash using the heterogeneous integration technique. To maximize 3D NAND's bandwidth utilization, we present customized dataflow and optimized data allocation scheme. Our evaluation results show that: compared to graph ANNS on CPU and GPU, Proxima achieves a magnitude improvement in throughput or energy efficiency. Proxima yields 7x to 13x speedup over existing ASIC designs. Furthermore, Proxima achieves a good balance between accuracy, efficiency and storage density compared to previous NSP-based accelerators.
    Submitted 7 December, 2023; originally announced December 2023.
This paper proposes to address this challenge by leveraging synthetic speech to augment a low-resource pre-training corpus. We construct a high-quality text-to-speech (TTS) system with limited resources using SSL features and generate a large synthetic corpus for pre-training. Experimental results demonstrate that our proposed approach effectively reduces the demand for speech data by 90% with only slight performance degradation. To the best of our knowledge, this is the first work aiming to enhance low-resource self-supervised learning in speech processing. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2309.17020v2-abstract-full').style.display = 'none'; document.getElementById('2309.17020v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 29 September, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ASRU 2023 SPARKS Workshop</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2309.08448">arXiv:2309.08448</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2309.08448">pdf</a>, <a href="https://arxiv.org/format/2309.08448">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Advancing the Evaluation of Traditional Chinese Language Models: Towards a Comprehensive Benchmark Suite </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Hsu%2C+C">Chan-Jan Hsu</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+C">Chang-Le Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Liao%2C+F">Feng-Ting Liao</a>, <a href="/search/cs?searchtype=author&amp;query=Hsu%2C+P">Po-Chun Hsu</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+Y">Yi-Chang Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Shiu%2C+D">Da-shan Shiu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2309.08448v2-abstract-short" style="display: inline;"> The evaluation of large language models is an essential task in the field of language understanding and generation. As language models continue to advance, the need for effective benchmarks to assess their performance has become imperative. In the context of Traditional Chinese, there is a scarcity of comprehensive and diverse benchmarks to evaluate the capabilities of language models, despite the&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2309.08448v2-abstract-full').style.display = 'inline'; document.getElementById('2309.08448v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2309.08448v2-abstract-full" style="display: none;"> The evaluation of large language models is an essential task in the field of language understanding and generation. 
As language models continue to advance, the need for effective benchmarks to assess their performance has become imperative. In the context of Traditional Chinese, there is a scarcity of comprehensive and diverse benchmarks to evaluate the capabilities of language models, despite the existence of certain benchmarks such as DRCD, TTQA, CMDQA, and FGC dataset. To address this gap, we propose a novel set of benchmarks that leverage existing English datasets and are tailored to evaluate language models in Traditional Chinese. These benchmarks encompass a wide range of tasks, including contextual question-answering, summarization, classification, and table understanding. The proposed benchmarks offer a comprehensive evaluation framework, enabling the assessment of language models&#39; capabilities across different tasks. In this paper, we evaluate the performance of GPT-3.5, Taiwan-LLaMa-v1.0, and Model 7-C, our proprietary model, on these benchmarks. The evaluation results highlight that our model, Model 7-C, achieves performance comparable to GPT-3.5 with respect to a part of the evaluated capabilities. In an effort to advance the evaluation of language models in Traditional Chinese and stimulate further research in this field, we have open-sourced our benchmark and opened the model for trial. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2309.08448v2-abstract-full').style.display = 'none'; document.getElementById('2309.08448v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 October, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 15 September, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2306.03942">arXiv:2306.03942</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2306.03942">pdf</a>, <a href="https://arxiv.org/format/2306.03942">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> </div> </div> <p class="title is-5 mathjax"> NFT.mine: An xDeepFM-based Recommender System for Non-fungible Token (NFT) Buyers </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Li%2C+S">Shuwei Li</a>, <a href="/search/cs?searchtype=author&amp;query=Jin%2C+Y">Yucheng Jin</a>, <a href="/search/cs?searchtype=author&amp;query=Hsu%2C+P">Pin-Lun Hsu</a>, <a href="/search/cs?searchtype=author&amp;query=Luo%2C+Y">Ya-Sin Luo</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2306.03942v1-abstract-short" style="display: inline;"> Non-fungible token (NFT) is a tradable unit of data stored on the blockchain which can be associated with some digital asset as a certification of ownership. The past several years have witnessed the exponential growth of the NFT market. In 2021, the NFT market reached its peak with more than $40 billion trades. 
Despite the booming NFT market, most NFT-related studies focus on its technical aspect&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2306.03942v1-abstract-full').style.display = 'inline'; document.getElementById('2306.03942v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2306.03942v1-abstract-full" style="display: none;"> A non-fungible token (NFT) is a tradable unit of data stored on the blockchain that can be associated with a digital asset as a certification of ownership. The past several years have witnessed the exponential growth of the NFT market. In 2021, the NFT market reached its peak with more than $40 billion in trades. Despite the booming NFT market, most NFT-related studies focus on its technical aspects, such as standards, protocols, and security, while our study aims at developing a pioneering recommender system for NFT buyers. In this paper, we introduce an extreme deep factorization machine (xDeepFM)-based recommender system, NFT.mine, which achieves real-time data collection, data cleaning, feature extraction, training, and inference. We used data from OpenSea, the most influential NFT trading platform, to evaluate the performance of NFT.mine. Experiments showed that NFT.mine outperforms traditional models such as logistic regression, naive Bayes, and random forest, achieving higher AUC and lower cross-entropy loss, and outputs personalized recommendations for NFT buyers. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2306.03942v1-abstract-full').style.display = 'none'; document.getElementById('2306.03942v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 June, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2023.
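<p class="is-size-7">The NFT.mine entry above builds on xDeepFM. As a toy illustration only (random embeddings and made-up field/embedding sizes, not the paper's implementation), the Compressed Interaction Network at the heart of xDeepFM can be sketched in NumPy:</p> <pre><code>
# Toy NumPy sketch of xDeepFM's Compressed Interaction Network (CIN):
# explicit field-wise interactions from per-field embeddings, sum-pooled into a score.
import numpy as np

rng = np.random.default_rng(0)
m, d = 6, 8                        # assumed: 6 feature fields, embedding dimension 8
x0 = rng.normal(size=(m, d))       # field embeddings for one (buyer, NFT) example

def cin_layer(x_prev, x0, n_maps, rng):
    """One CIN layer: weighted Hadamard interactions between x_prev and x0."""
    w = rng.normal(size=(n_maps, x_prev.shape[0], x0.shape[0]))
    z = x_prev[:, None, :] * x0[None, :, :]        # pairwise products, shape (h_prev, m, d)
    return np.einsum("hij,ijd->hd", w, z)          # shape (n_maps, d)

x1 = cin_layer(x0, x0, n_maps=4, rng=rng)
x2 = cin_layer(x1, x0, n_maps=4, rng=rng)
pooled = np.concatenate([x1.sum(axis=1), x2.sum(axis=1)])   # sum-pool each feature map over d
score = 1.0 / (1.0 + np.exp(-(pooled @ rng.normal(size=pooled.size))))
print("predicted interaction propensity:", float(score))
</code></pre>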
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">6 pages, 8 figures, 2 tables</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2304.13109">arXiv:2304.13109</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2304.13109">pdf</a>, <a href="https://arxiv.org/ps/2304.13109">ps</a>, <a href="https://arxiv.org/format/2304.13109">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1109/VTC2022-Fall57202.2022.10012887">10.1109/VTC2022-Fall57202.2022.10012887 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Federated Deep Reinforcement Learning for THz-Beam Search with Limited CSI </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Hsu%2C+P">Po-Chun Hsu</a>, <a href="/search/cs?searchtype=author&amp;query=Shen%2C+L">Li-Hsiang Shen</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+C">Chun-Hung Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Feng%2C+K">Kai-Ten Feng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2304.13109v1-abstract-short" style="display: inline;"> Terahertz (THz) communication with ultra-wide available spectrum is a promising technique that can achieve the stringent requirement of high data rate in the next-generation wireless networks, yet its severe propagation attenuation significantly hinders its implementation in practice. Finding beam directions for a large-scale antenna array to effectively overcome severe propagation attenuation of&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2304.13109v1-abstract-full').style.display = 'inline'; document.getElementById('2304.13109v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2304.13109v1-abstract-full" style="display: none;"> Terahertz (THz) communication with ultra-wide available spectrum is a promising technique that can achieve the stringent requirement of high data rate in the next-generation wireless networks, yet its severe propagation attenuation significantly hinders its implementation in practice. Finding beam directions for a large-scale antenna array to effectively overcome severe propagation attenuation of THz signals is a pressing need. This paper proposes a novel approach of federated deep reinforcement learning (FDRL) to swiftly perform THz-beam search for multiple base stations (BSs) coordinated by an edge server in a cellular network. All the BSs conduct deep deterministic policy gradient (DDPG)-based DRL to obtain THz beamforming policy with limited channel state information (CSI). They update their DDPG models with hidden information in order to mitigate inter-cell interference. 
We demonstrate that the cell network can achieve higher throughput as more THz CSI and hidden neurons of DDPG are adopted. We also show that FDRL with partial model update is able to nearly achieve the same performance of FDRL with full model update, which indicates an effective means to reduce communication load between the edge server and the BSs by partial model uploading. Moreover, the proposed FDRL outperforms conventional non-learning-based and existing non-FDRL benchmark optimization methods. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2304.13109v1-abstract-full').style.display = 'none'; document.getElementById('2304.13109v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 April, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> IEEE Vehicular Technology Conference (VTC-Fall), 2022 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2303.04715">arXiv:2303.04715</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2303.04715">pdf</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Extending the Pre-Training of BLOOM for Improved Support of Traditional Chinese: Models, Methods and Results </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Ennen%2C+P">Philipp Ennen</a>, <a href="/search/cs?searchtype=author&amp;query=Hsu%2C+P">Po-Chun Hsu</a>, <a href="/search/cs?searchtype=author&amp;query=Hsu%2C+C">Chan-Jan Hsu</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+C">Chang-Le Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+Y">Yen-Chen Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Liao%2C+Y">Yin-Hsiang Liao</a>, <a href="/search/cs?searchtype=author&amp;query=Lin%2C+C">Chin-Tung Lin</a>, <a href="/search/cs?searchtype=author&amp;query=Shiu%2C+D">Da-Shan Shiu</a>, <a href="/search/cs?searchtype=author&amp;query=Ma%2C+W">Wei-Yun Ma</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2303.04715v2-abstract-short" style="display: inline;"> In this paper we present the multilingual language model BLOOM-zh that features enhanced support for Traditional Chinese. BLOOM-zh has its origins in the open-source BLOOM models presented by BigScience in 2022. 
Starting from released models, we extended the pre-training of BLOOM by additional 7.4 billion tokens in Traditional Chinese and English covering a variety of domains such as news articles&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2303.04715v2-abstract-full').style.display = 'inline'; document.getElementById('2303.04715v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2303.04715v2-abstract-full" style="display: none;"> In this paper we present the multilingual language model BLOOM-zh that features enhanced support for Traditional Chinese. BLOOM-zh has its origins in the open-source BLOOM models presented by BigScience in 2022. Starting from released models, we extended the pre-training of BLOOM by additional 7.4 billion tokens in Traditional Chinese and English covering a variety of domains such as news articles, books, encyclopedias, educational materials as well as spoken language. In order to show the properties of BLOOM-zh, both existing and newly created benchmark scenarios are used for evaluating the performance. BLOOM-zh outperforms its predecessor on most Traditional Chinese benchmarks while maintaining its English capability. We release all our models to the research community. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2303.04715v2-abstract-full').style.display = 'none'; document.getElementById('2303.04715v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 June, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 8 March, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2023. 
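<p class="is-size-7">For the BLOOM-zh entry above, only the 7.4-billion-token extension comes from the abstract; the sequence length and global batch size below are assumptions. A back-of-the-envelope sketch of how many optimizer steps such an extension implies:</p> <pre><code>
# Only the 7.4B-token extension comes from the abstract; sequence length and
# global batch size are assumptions for a rough step-count estimate.
import math

extra_tokens    = 7.4e9
seq_len         = 2048
global_batch    = 512
tokens_per_step = seq_len * global_batch

steps = math.ceil(extra_tokens / tokens_per_step)
print(f"{tokens_per_step / 1e6:.2f}M tokens per step -> about {steps:,} optimizer steps")
</code></pre>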
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2207.14568">arXiv:2207.14568</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2207.14568">pdf</a>, <a href="https://arxiv.org/format/2207.14568">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Learning Phone Recognition from Unpaired Audio and Phone Sequences Based on Generative Adversarial Network </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Liu%2C+D">Da-rong Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Hsu%2C+P">Po-chun Hsu</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+Y">Yi-chen Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+S">Sung-feng Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Chuang%2C+S">Shun-po Chuang</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+D">Da-yi Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Lee%2C+H">Hung-yi Lee</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2207.14568v1-abstract-short" style="display: inline;"> ASR has been shown to achieve great performance recently. However, most of them rely on massive paired data, which is not feasible for low-resource languages worldwide. This paper investigates how to learn directly from unpaired phone sequences and speech utterances. We design a two-stage iterative framework. GAN training is adopted in the first stage to find the mapping relationship between unpai&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2207.14568v1-abstract-full').style.display = 'inline'; document.getElementById('2207.14568v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2207.14568v1-abstract-full" style="display: none;"> ASR has been shown to achieve great performance recently. However, most of them rely on massive paired data, which is not feasible for low-resource languages worldwide. This paper investigates how to learn directly from unpaired phone sequences and speech utterances. We design a two-stage iterative framework. GAN training is adopted in the first stage to find the mapping relationship between unpaired speech and phone sequence. In the second stage, another HMM model is introduced to train from the generator&#39;s output, which boosts the performance and provides a better segmentation for the next iteration. In the experiment, we first investigate different choices of model designs. Then we compare the framework to different types of baselines: (i) supervised methods (ii) acoustic unit discovery based methods (iii) methods learning from unpaired data. Our framework performs consistently better than all acoustic unit discovery methods and previous methods learning from unpaired data based on the TIMIT dataset. 
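<p class="is-size-7">The entry above learns phone recognition from unpaired audio and phone sequences via a first-stage GAN. A minimal, self-contained sketch of that adversarial setup (random stand-in features, assumed MFCC dimension and phone inventory; not the paper's architecture):</p> <pre><code>
# Minimal sketch of the first-stage adversarial idea with random stand-in data
# (assumed MFCC dimension and phone inventory; not the paper's architecture).
import torch
import torch.nn as nn

T, F_DIM, N_PHONES = 100, 39, 48
gen  = nn.Sequential(nn.Linear(F_DIM, 256), nn.ReLU(), nn.Linear(256, N_PHONES))
disc = nn.Sequential(nn.Linear(N_PHONES, 256), nn.ReLU(), nn.Linear(256, 1))
opt_g = torch.optim.Adam(gen.parameters(), lr=1e-4)
opt_d = torch.optim.Adam(disc.parameters(), lr=1e-4)
bce = nn.BCEWithLogitsLoss()

speech = torch.randn(T, F_DIM)                    # unpaired speech features
real = nn.functional.one_hot(torch.randint(0, N_PHONES, (T,)), N_PHONES).float()  # unpaired phone text

# Discriminator step: real one-hot phone frames vs. generated posteriors.
fake = gen(speech).softmax(dim=-1)
d_loss = bce(disc(real), torch.ones(T, 1)) + bce(disc(fake.detach()), torch.zeros(T, 1))
opt_d.zero_grad(); d_loss.backward(); opt_d.step()

# Generator step: produce posteriors the discriminator accepts as phone-like.
g_loss = bce(disc(gen(speech).softmax(dim=-1)), torch.ones(T, 1))
opt_g.zero_grad(); g_loss.backward(); opt_g.step()
print(float(d_loss), float(g_loss))
</code></pre>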
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2207.14568v1-abstract-full').style.display = 'none'; document.getElementById('2207.14568v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 July, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2022. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2207.10643">arXiv:2207.10643</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2207.10643">pdf</a>, <a href="https://arxiv.org/format/2207.10643">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> STOP: A dataset for Spoken Task Oriented Semantic Parsing </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Tomasello%2C+P">Paden Tomasello</a>, <a href="/search/cs?searchtype=author&amp;query=Shrivastava%2C+A">Akshat Shrivastava</a>, <a href="/search/cs?searchtype=author&amp;query=Lazar%2C+D">Daniel Lazar</a>, <a href="/search/cs?searchtype=author&amp;query=Hsu%2C+P">Po-Chun Hsu</a>, <a href="/search/cs?searchtype=author&amp;query=Le%2C+D">Duc Le</a>, <a href="/search/cs?searchtype=author&amp;query=Sagar%2C+A">Adithya Sagar</a>, <a href="/search/cs?searchtype=author&amp;query=Elkahky%2C+A">Ali Elkahky</a>, <a href="/search/cs?searchtype=author&amp;query=Copet%2C+J">Jade Copet</a>, <a href="/search/cs?searchtype=author&amp;query=Hsu%2C+W">Wei-Ning Hsu</a>, <a href="/search/cs?searchtype=author&amp;query=Adi%2C+Y">Yossi Adi</a>, <a href="/search/cs?searchtype=author&amp;query=Algayres%2C+R">Robin Algayres</a>, <a href="/search/cs?searchtype=author&amp;query=Nguyen%2C+T+A">Tu Ahn Nguyen</a>, <a href="/search/cs?searchtype=author&amp;query=Dupoux%2C+E">Emmanuel Dupoux</a>, <a href="/search/cs?searchtype=author&amp;query=Zettlemoyer%2C+L">Luke Zettlemoyer</a>, <a href="/search/cs?searchtype=author&amp;query=Mohamed%2C+A">Abdelrahman Mohamed</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2207.10643v3-abstract-short" style="display: inline;"> End-to-end spoken language understanding (SLU) predicts intent directly from audio using a single model. It promises to improve the performance of assistant systems by leveraging acoustic information lost in the intermediate textual representation and preventing cascading errors from Automatic Speech Recognition (ASR). 
Further, having one unified model has efficiency advantages when deploying assi&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2207.10643v3-abstract-full').style.display = 'inline'; document.getElementById('2207.10643v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2207.10643v3-abstract-full" style="display: none;"> End-to-end spoken language understanding (SLU) predicts intent directly from audio using a single model. It promises to improve the performance of assistant systems by leveraging acoustic information lost in the intermediate textual representation and preventing cascading errors from Automatic Speech Recognition (ASR). Further, having one unified model has efficiency advantages when deploying assistant systems on-device. However, the limited number of public audio datasets with semantic parse labels hinders the research progress in this area. In this paper, we release the Spoken Task-Oriented semantic Parsing (STOP) dataset, the largest and most complex SLU dataset to be publicly available. Additionally, we define low-resource splits to establish a benchmark for improving SLU when limited labeled data is available. Furthermore, in addition to the human-recorded audio, we are releasing a TTS-generated version to benchmark the performance for low-resource domain adaptation of end-to-end SLU systems. Initial experimentation show end-to-end SLU models performing slightly worse than their cascaded counterparts, which we hope encourages future work in this direction. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2207.10643v3-abstract-full').style.display = 'none'; document.getElementById('2207.10643v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 October, 2022; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 28 June, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2022. 
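<p class="is-size-7">For the STOP entry above, end-to-end and cascaded SLU systems are typically compared by exact match against the gold semantic parse. A tiny sketch of that metric (the example parses are invented, not drawn from STOP):</p> <pre><code>
# Tiny sketch of exact-match evaluation for task-oriented semantic parses
# (example parses are invented; the same metric applies to cascaded ASR+NLU output).
def exact_match(preds, golds):
    norm = lambda s: " ".join(s.strip().split())
    return sum(norm(p) == norm(g) for p, g in zip(preds, golds)) / len(golds)

gold = ["[IN:GET_WEATHER [SL:LOCATION boston ] ]",
        "[IN:CREATE_ALARM [SL:DATE_TIME at 7 am ] ]"]
e2e  = ["[IN:GET_WEATHER [SL:LOCATION boston ] ]",
        "[IN:CREATE_ALARM [SL:DATE_TIME at 7 pm ] ]"]
print("end-to-end exact match:", exact_match(e2e, gold))   # 0.5 on this toy pair
</code></pre>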
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2205.09185">arXiv:2205.09185</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2205.09185">pdf</a>, <a href="https://arxiv.org/format/2205.09185">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Instrumentation and Detectors">physics.ins-det</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="High Energy Physics - Experiment">hep-ex</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Nuclear Experiment">nucl-ex</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computational Physics">physics.comp-ph</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1016/j.nima.2022.167748">10.1016/j.nima.2022.167748 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> AI-assisted Optimization of the ECCE Tracking System at the Electron Ion Collider </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Fanelli%2C+C">C. Fanelli</a>, <a href="/search/cs?searchtype=author&amp;query=Papandreou%2C+Z">Z. Papandreou</a>, <a href="/search/cs?searchtype=author&amp;query=Suresh%2C+K">K. Suresh</a>, <a href="/search/cs?searchtype=author&amp;query=Adkins%2C+J+K">J. K. Adkins</a>, <a href="/search/cs?searchtype=author&amp;query=Akiba%2C+Y">Y. Akiba</a>, <a href="/search/cs?searchtype=author&amp;query=Albataineh%2C+A">A. Albataineh</a>, <a href="/search/cs?searchtype=author&amp;query=Amaryan%2C+M">M. Amaryan</a>, <a href="/search/cs?searchtype=author&amp;query=Arsene%2C+I+C">I. C. Arsene</a>, <a href="/search/cs?searchtype=author&amp;query=Gayoso%2C+C+A">C. Ayerbe Gayoso</a>, <a href="/search/cs?searchtype=author&amp;query=Bae%2C+J">J. Bae</a>, <a href="/search/cs?searchtype=author&amp;query=Bai%2C+X">X. Bai</a>, <a href="/search/cs?searchtype=author&amp;query=Baker%2C+M+D">M. D. Baker</a>, <a href="/search/cs?searchtype=author&amp;query=Bashkanov%2C+M">M. Bashkanov</a>, <a href="/search/cs?searchtype=author&amp;query=Bellwied%2C+R">R. Bellwied</a>, <a href="/search/cs?searchtype=author&amp;query=Benmokhtar%2C+F">F. Benmokhtar</a>, <a href="/search/cs?searchtype=author&amp;query=Berdnikov%2C+V">V. Berdnikov</a>, <a href="/search/cs?searchtype=author&amp;query=Bernauer%2C+J+C">J. C. Bernauer</a>, <a href="/search/cs?searchtype=author&amp;query=Bock%2C+F">F. Bock</a>, <a href="/search/cs?searchtype=author&amp;query=Boeglin%2C+W">W. Boeglin</a>, <a href="/search/cs?searchtype=author&amp;query=Borysova%2C+M">M. Borysova</a>, <a href="/search/cs?searchtype=author&amp;query=Brash%2C+E">E. Brash</a>, <a href="/search/cs?searchtype=author&amp;query=Brindza%2C+P">P. Brindza</a>, <a href="/search/cs?searchtype=author&amp;query=Briscoe%2C+W+J">W. J. Briscoe</a>, <a href="/search/cs?searchtype=author&amp;query=Brooks%2C+M">M. Brooks</a>, <a href="/search/cs?searchtype=author&amp;query=Bueltmann%2C+S">S. Bueltmann</a> , et al. 
(258 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2205.09185v2-abstract-short" style="display: inline;"> The Electron-Ion Collider (EIC) is a cutting-edge accelerator facility that will study the nature of the &#34;glue&#34; that binds the building blocks of the visible matter in the universe. The proposed experiment will be realized at Brookhaven National Laboratory in approximately 10 years from now, with detector design and R&amp;D currently ongoing. Notably, EIC is one of the first large-scale facilities to&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2205.09185v2-abstract-full').style.display = 'inline'; document.getElementById('2205.09185v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2205.09185v2-abstract-full" style="display: none;"> The Electron-Ion Collider (EIC) is a cutting-edge accelerator facility that will study the nature of the &#34;glue&#34; that binds the building blocks of the visible matter in the universe. The proposed experiment will be realized at Brookhaven National Laboratory in approximately 10 years from now, with detector design and R&amp;D currently ongoing. Notably, EIC is one of the first large-scale facilities to leverage Artificial Intelligence (AI) already starting from the design and R&amp;D phases. The EIC Comprehensive Chromodynamics Experiment (ECCE) is a consortium that proposed a detector design based on a 1.5T solenoid. The EIC detector proposal review concluded that the ECCE design will serve as the reference design for an EIC detector. Herein we describe a comprehensive optimization of the ECCE tracker using AI. The work required a complex parametrization of the simulated detector system. Our approach dealt with an optimization problem in a multidimensional design space driven by multiple objectives that encode the detector performance, while satisfying several mechanical constraints. We describe our strategy and show results obtained for the ECCE tracking system. The AI-assisted design is agnostic to the simulation framework and can be extended to other sub-detectors or to a system of sub-detectors to further optimize the performance of the EIC detector. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2205.09185v2-abstract-full').style.display = 'none'; document.getElementById('2205.09185v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 May, 2022; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 18 May, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2022. 
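<p class="is-size-7">The ECCE entry above describes a multi-objective, constraint-aware design optimization. A toy sketch of the underlying idea, sampling candidate tracker parametrizations and keeping the Pareto front (the two objectives below are invented stand-ins, not the ECCE simulation):</p> <pre><code>
# Toy sketch of the multi-objective design loop: sample tracker parametrizations,
# score competing objectives, keep the non-dominated (Pareto) set.
# The two objectives below are invented stand-ins, not the ECCE simulation.
import numpy as np

rng = np.random.default_rng(1)

def objectives(x):
    resolution = (x[0] - 0.6) ** 2 + 0.1 * x[1]   # stand-in for momentum resolution (minimize)
    material   = x[1] ** 2 + 0.1 * x[0]           # stand-in for material budget (minimize)
    return np.array([resolution, material])

designs = rng.uniform(0.0, 1.0, size=(200, 2))
scores  = np.array([objectives(d) for d in designs])

def pareto_mask(scores):
    dominated = np.zeros(len(scores), dtype=bool)
    for i, s in enumerate(scores):
        dominated[i] = np.any(np.all(scores <= s, axis=1) & np.any(scores < s, axis=1))
    return ~dominated

front = designs[pareto_mask(scores)]
print(f"{len(front)} non-dominated designs out of {len(designs)}")
</code></pre>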
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">16 pages, 18 figures, 2 appendices, 3 tables</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2205.03759">arXiv:2205.03759</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2205.03759">pdf</a>, <a href="https://arxiv.org/format/2205.03759">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Silence is Sweeter Than Speech: Self-Supervised Model Using Silence to Store Speaker Information </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Feng%2C+C">Chi-Luen Feng</a>, <a href="/search/cs?searchtype=author&amp;query=Hsu%2C+P">Po-chun Hsu</a>, <a href="/search/cs?searchtype=author&amp;query=Lee%2C+H">Hung-yi Lee</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2205.03759v1-abstract-short" style="display: inline;"> Self-Supervised Learning (SSL) has made great strides recently. SSL speech models achieve decent performance on a wide range of downstream tasks, suggesting that they extract different aspects of information from speech. However, how SSL models store various information in hidden representations without interfering is still poorly understood. Taking the recently successful SSL model, HuBERT, as an&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2205.03759v1-abstract-full').style.display = 'inline'; document.getElementById('2205.03759v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2205.03759v1-abstract-full" style="display: none;"> Self-Supervised Learning (SSL) has made great strides recently. SSL speech models achieve decent performance on a wide range of downstream tasks, suggesting that they extract different aspects of information from speech. However, how SSL models store various information in hidden representations without interfering is still poorly understood. Taking the recently successful SSL model, HuBERT, as an example, we explore how the SSL model processes and stores speaker information in the representation. We found that HuBERT stores speaker information in representations whose positions correspond to silences in a waveform. There are several pieces of evidence. (1) We find that the utterances with more silent parts in the waveforms have better Speaker Identification (SID) accuracy. (2) If we use the whole utterances for SID, the silence part always contributes more to the SID task. (3) If we only use the representation of a part of the utterance for SID, the silenced part has higher accuracy than the other parts. Our findings not only contribute to a better understanding of SSL models but also improve performance. By simply adding silence to the original waveform, HuBERT improved its accuracy on SID by nearly 2%. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2205.03759v1-abstract-full').style.display = 'none'; document.getElementById('2205.03759v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 May, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2022. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2204.11806">arXiv:2204.11806</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2204.11806">pdf</a>, <a href="https://arxiv.org/format/2204.11806">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1109/TASLP.2023.3301212">10.1109/TASLP.2023.3301212 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Parallel Synthesis for Autoregressive Speech Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Hsu%2C+P">Po-chun Hsu</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+D">Da-rong Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+A+T">Andy T. Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Lee%2C+H">Hung-yi Lee</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2204.11806v3-abstract-short" style="display: inline;"> Autoregressive neural vocoders have achieved outstanding performance in speech synthesis tasks such as text-to-speech and voice conversion. An autoregressive vocoder predicts a sample at some time step conditioned on those at previous time steps. Though it synthesizes natural human speech, the iterative generation inevitably makes the synthesis time proportional to the utterance length, leading to&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2204.11806v3-abstract-full').style.display = 'inline'; document.getElementById('2204.11806v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2204.11806v3-abstract-full" style="display: none;"> Autoregressive neural vocoders have achieved outstanding performance in speech synthesis tasks such as text-to-speech and voice conversion. An autoregressive vocoder predicts a sample at some time step conditioned on those at previous time steps. Though it synthesizes natural human speech, the iterative generation inevitably makes the synthesis time proportional to the utterance length, leading to low efficiency. Many works were dedicated to generating the whole speech sequence in parallel and proposed GAN-based, flow-based, and score-based vocoders. This paper proposed a new thought for the autoregressive generation. 
Instead of iteratively predicting samples in a time sequence, the proposed model performs frequency-wise autoregressive generation (FAR) and bit-wise autoregressive generation (BAR) to synthesize speech. In FAR, a speech utterance is split into frequency subbands, and a subband is generated conditioned on the previously generated one. Similarly, in BAR, an 8-bit quantized signal is generated iteratively from the first bit. By redesigning the autoregressive method to compute in domains other than the time domain, the number of iterations in the proposed model is no longer proportional to the utterance length but to the number of subbands/bits, significantly increasing inference efficiency. Besides, a post-filter is employed to sample signals from output posteriors; its training objective is designed based on the characteristics of the proposed methods. Experimental results show that the proposed model can synthesize speech faster than real-time without GPU acceleration. Compared with baseline vocoders, the proposed model achieves better MUSHRA results and shows good generalization ability for unseen speakers and 44 kHz speech. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2204.11806v3-abstract-full').style.display = 'none'; document.getElementById('2204.11806v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 25 April, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">IEEE/ACM Transactions on Audio, Speech, and Language Processing</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2204.05486">arXiv:2204.05486</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2204.05486">pdf</a>, <a href="https://arxiv.org/format/2204.05486">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Neural Graph Matching for Modification Similarity Applied to Electronic Document Comparison </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Hsu%2C+P">Po-Fang Hsu</a>, <a href="/search/cs?searchtype=author&amp;query=Wei%2C+C">Chiching Wei</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2204.05486v2-abstract-short" style="display: inline;"> In this paper, we present a novel neural graph matching approach applied to document comparison. Document comparison is a common task in the legal and financial industries. In some cases, the most important differences may be the addition or omission of words, sentences, clauses, or paragraphs. However, it is a challenging task without recording or tracing whole edited process. 
Under many temporal&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2204.05486v2-abstract-full').style.display = 'inline'; document.getElementById('2204.05486v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2204.05486v2-abstract-full" style="display: none;"> In this paper, we present a novel neural graph matching approach applied to document comparison. Document comparison is a common task in the legal and financial industries. In some cases, the most important differences may be the addition or omission of words, sentences, clauses, or paragraphs. However, detecting such differences is challenging when the editing process has not been recorded or traced. Under this uncertainty, we explore how well our approach can approximate an accurate comparison and determine which element blocks are related to one another through edits. First, we apply a document layout analysis that combines traditional and modern techniques to segment the layout into blocks of the appropriate types. We then cast the task as layout graph matching with textual awareness. Graph matching is a long-studied problem with a broad range of applications; unlike previous works that focus on visual images or structural layout alone, we also bring textual features into our model to adapt it to this domain. Specifically, we introduce an encoder that handles the visual presentation decoded from the electronic PDF document. In addition, because modifications can make the layout analyses of the two documents inconsistent, and blocks can be merged or split, we adopt the Sinkhorn divergence in our graph neural approach, which addresses both issues through many-to-many block matching. We demonstrate the approach on two categories of layouts, legal agreements and scientific articles, collected from our real-case datasets. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2204.05486v2-abstract-full').style.display = 'none'; document.getElementById('2204.05486v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 November, 2022; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 11 April, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2022.
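<p class="is-size-7">The entry above adopts the Sinkhorn divergence for many-to-many block matching. A compact sketch of Sinkhorn normalization turning a block-similarity matrix into a soft assignment (toy matrix, assumed regularization strength; not the paper's model):</p> <pre><code>
# Compact sketch of Sinkhorn normalization for soft many-to-many block matching
# (toy similarity matrix; eps and iteration count are assumptions).
import numpy as np

def sinkhorn(sim, eps=0.1, n_iters=50):
    """Turn a block-to-block similarity matrix into a soft assignment matrix."""
    K = np.exp(sim / eps)
    for _ in range(n_iters):
        K = K / K.sum(axis=1, keepdims=True)   # row normalization
        K = K / K.sum(axis=0, keepdims=True)   # column normalization
    return K

rng = np.random.default_rng(0)
sim = rng.normal(size=(4, 5))   # 4 blocks in the original document vs. 5 in the edited one
P = sinkhorn(sim)
print(np.round(P, 2))           # merged/split blocks show their mass spread across rows/columns
</code></pre>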
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2204.00630">arXiv:2204.00630</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2204.00630">pdf</a>, <a href="https://arxiv.org/format/2204.00630">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Extremely Low-light Image Enhancement with Scene Text Restoration </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Hsu%2C+P">Pohao Hsu</a>, <a href="/search/cs?searchtype=author&amp;query=Lin%2C+C">Che-Tsung Lin</a>, <a href="/search/cs?searchtype=author&amp;query=Ng%2C+C+C">Chun Chet Ng</a>, <a href="/search/cs?searchtype=author&amp;query=Kew%2C+J">Jie-Long Kew</a>, <a href="/search/cs?searchtype=author&amp;query=Tan%2C+M+Y">Mei Yih Tan</a>, <a href="/search/cs?searchtype=author&amp;query=Lai%2C+S">Shang-Hong Lai</a>, <a href="/search/cs?searchtype=author&amp;query=Chan%2C+C+S">Chee Seng Chan</a>, <a href="/search/cs?searchtype=author&amp;query=Zach%2C+C">Christopher Zach</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2204.00630v1-abstract-short" style="display: inline;"> Deep learning-based methods have made impressive progress in enhancing extremely low-light images - the image quality of the reconstructed images has generally improved. However, we found out that most of these methods could not sufficiently recover the image details, for instance, the texts in the scene. In this paper, a novel image enhancement framework is proposed to precisely restore the scene&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2204.00630v1-abstract-full').style.display = 'inline'; document.getElementById('2204.00630v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2204.00630v1-abstract-full" style="display: none;"> Deep learning-based methods have made impressive progress in enhancing extremely low-light images - the image quality of the reconstructed images has generally improved. However, we found out that most of these methods could not sufficiently recover the image details, for instance, the texts in the scene. In this paper, a novel image enhancement framework is proposed to precisely restore the scene texts, as well as the overall quality of the image simultaneously under extremely low-light images conditions. Mainly, we employed a self-regularised attention map, an edge map, and a novel text detection loss. In addition, leveraging synthetic low-light images is beneficial for image enhancement on the genuine ones in terms of text detection. The quantitative and qualitative experimental results have shown that the proposed model outperforms state-of-the-art methods in image restoration, text detection, and text spotting on See In the Dark and ICDAR15 datasets. 
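<p class="is-size-7">The entry above combines a self-regularised attention map, an edge map, and a text detection loss. A rough sketch of how such a composite training loss could be assembled (the Sobel edge term, the mask-based text term, and the weights are assumptions, not the paper's exact formulation):</p> <pre><code>
# Rough sketch of a composite enhancement loss: reconstruction + edge consistency
# + a text-region term (Sobel edges, the binary text mask, and weights are assumptions).
import torch
import torch.nn.functional as F

def sobel_edges(img):
    kx = torch.tensor([[[[-1., 0., 1.], [-2., 0., 2.], [-1., 0., 1.]]]])
    ky = kx.transpose(2, 3)
    gray = img.mean(dim=1, keepdim=True)
    return torch.cat([F.conv2d(gray, kx, padding=1), F.conv2d(gray, ky, padding=1)], dim=1)

def enhancement_loss(pred, target, text_mask, w_edge=0.1, w_text=1.0):
    l_rec  = F.l1_loss(pred, target)
    l_edge = F.l1_loss(sobel_edges(pred), sobel_edges(target))
    l_text = F.l1_loss(pred * text_mask, target * text_mask)   # stand-in for a detector-based text loss
    return l_rec + w_edge * l_edge + w_text * l_text

pred   = torch.rand(1, 3, 64, 64)                    # enhanced output
target = torch.rand(1, 3, 64, 64)                    # well-lit ground truth
mask   = (torch.rand(1, 1, 64, 64) > 0.9).float()    # toy scene-text region mask
print(float(enhancement_loss(pred, target, mask)))
</code></pre>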
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2204.00630v1-abstract-full').style.display = 'none'; document.getElementById('2204.00630v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 April, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2022. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2204.00170">arXiv:2204.00170</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2204.00170">pdf</a>, <a href="https://arxiv.org/format/2204.00170">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> Universal Adaptor: Converting Mel-Spectrograms Between Different Configurations for Speech Synthesis </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wang%2C+F">Fan-Lin Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Hsu%2C+P">Po-chun Hsu</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+D">Da-rong Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Lee%2C+H">Hung-yi Lee</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2204.00170v2-abstract-short" style="display: inline;"> Most recent speech synthesis systems are composed of a synthesizer and a vocoder. However, the existing synthesizers and vocoders can only be matched to acoustic features extracted with a specific configuration. Hence, we can&#39;t combine arbitrary synthesizers and vocoders together to form a complete system, not to mention apply to a newly developed model. In this paper, we proposed Universal Adapto&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2204.00170v2-abstract-full').style.display = 'inline'; document.getElementById('2204.00170v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2204.00170v2-abstract-full" style="display: none;"> Most recent speech synthesis systems are composed of a synthesizer and a vocoder. However, the existing synthesizers and vocoders can only be matched to acoustic features extracted with a specific configuration. Hence, we can&#39;t combine arbitrary synthesizers and vocoders together to form a complete system, not to mention apply to a newly developed model. In this paper, we proposed Universal Adaptor, which takes a Mel-spectrogram parametrized by the source configuration and converts it into a Mel-spectrogram parametrized by the target configuration, as long as we feed in the source and the target configurations. Experiments show that the quality of speeches synthesized from our output of Universal Adaptor is comparable to those synthesized from ground truth Mel-spectrogram no matter in single-speaker or multi-speaker scenarios. Moreover, Universal Adaptor can be applied in the recent TTS systems and voice conversion systems without dropping quality. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2204.00170v2-abstract-full').style.display = 'none'; document.getElementById('2204.00170v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 October, 2022; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 31 March, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2022. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2107.00309">arXiv:2107.00309</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2107.00309">pdf</a>, <a href="https://arxiv.org/format/2107.00309">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Adversarial Sample Detection for Speaker Verification by Neural Vocoders </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wu%2C+H">Haibin Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Hsu%2C+P">Po-chun Hsu</a>, <a href="/search/cs?searchtype=author&amp;query=Gao%2C+J">Ji Gao</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+S">Shanshan Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+S">Shen Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Kang%2C+J">Jian Kang</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+Z">Zhiyong Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Meng%2C+H">Helen Meng</a>, <a href="/search/cs?searchtype=author&amp;query=Lee%2C+H">Hung-yi Lee</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2107.00309v4-abstract-short" style="display: inline;"> Automatic speaker verification (ASV), one of the most important technology for biometric identification, has been widely adopted in security-critical applications. However, ASV is seriously vulnerable to recently emerged adversarial attacks, yet effective countermeasures against them are limited. In this paper, we adopt neural vocoders to spot adversarial samples for ASV. We use the neural vocoder&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2107.00309v4-abstract-full').style.display = 'inline'; document.getElementById('2107.00309v4-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2107.00309v4-abstract-full" style="display: none;"> Automatic speaker verification (ASV), one of the most important technology for biometric identification, has been widely adopted in security-critical applications. However, ASV is seriously vulnerable to recently emerged adversarial attacks, yet effective countermeasures against them are limited. In this paper, we adopt neural vocoders to spot adversarial samples for ASV. 
We use the neural vocoder to re-synthesize audio and find that the difference between the ASV scores for the original and re-synthesized audio is a good indicator for discrimination between genuine and adversarial samples. This effort is, to the best of our knowledge, among the first to pursue such a technical direction for detecting time-domain adversarial samples for ASV, and hence there is a lack of established baselines for comparison. Consequently, we implement the Griffin-Lim algorithm as the detection baseline. The proposed approach achieves effective detection performance that outperforms the baselines in all settings. We also show that the neural vocoder adopted in the detection framework is dataset-independent. Our code will be made open-source so that future work can make fair comparisons. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2107.00309v4-abstract-full').style.display = 'none'; document.getElementById('2107.00309v4-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 May, 2022; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 1 July, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by ICASSP 2022</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2103.04088">arXiv:2103.04088</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2103.04088">pdf</a>, <a href="https://arxiv.org/format/2103.04088">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> Investigating on Incorporating Pretrained and Learnable Speaker Representations for Multi-Speaker Multi-Style Text-to-Speech </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Chien%2C+C">Chung-Ming Chien</a>, <a href="/search/cs?searchtype=author&amp;query=Lin%2C+J">Jheng-Hao Lin</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+C">Chien-yu Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Hsu%2C+P">Po-chun Hsu</a>, <a href="/search/cs?searchtype=author&amp;query=Lee%2C+H">Hung-yi Lee</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2103.04088v5-abstract-short" style="display: inline;"> The few-shot multi-speaker multi-style voice cloning task is to synthesize utterances with voice and speaking style similar to a reference speaker given only a few reference samples. In this work, we investigate different speaker representations and propose to integrate pretrained and learnable speaker representations.
Among different types of embeddings, the embedding pretrained by voice convers&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2103.04088v5-abstract-full').style.display = 'inline'; document.getElementById('2103.04088v5-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2103.04088v5-abstract-full" style="display: none;"> The few-shot multi-speaker multi-style voice cloning task is to synthesize utterances with voice and speaking style similar to a reference speaker given only a few reference samples. In this work, we investigate different speaker representations and proposed to integrate pretrained and learnable speaker representations. Among different types of embeddings, the embedding pretrained by voice conversion achieves the best performance. The FastSpeech 2 model combined with both pretrained and learnable speaker representations shows great generalization ability on few-shot speakers and achieved 2nd place in the one-shot track of the ICASSP 2021 M2VoC challenge. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2103.04088v5-abstract-full').style.display = 'none'; document.getElementById('2103.04088v5-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 May, 2021; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 6 March, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by ICASSP 2021, in the special session of ICASSP 2021 M2VoC Challenge</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2005.07412">arXiv:2005.07412</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2005.07412">pdf</a>, <a href="https://arxiv.org/format/2005.07412">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> WG-WaveNet: Real-Time High-Fidelity Speech Synthesis without GPU </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Hsu%2C+P">Po-chun Hsu</a>, <a href="/search/cs?searchtype=author&amp;query=Lee%2C+H">Hung-yi Lee</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2005.07412v3-abstract-short" style="display: inline;"> In this paper, we propose WG-WaveNet, a fast, lightweight, and high-quality waveform generation model. WG-WaveNet is composed of a compact flow-based model and a post-filter. The two components are jointly trained by maximizing the likelihood of the training data and optimizing loss functions on the frequency domains. 
As we design a flow-based model that is heavily compressed, the proposed model r&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2005.07412v3-abstract-full').style.display = 'inline'; document.getElementById('2005.07412v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2005.07412v3-abstract-full" style="display: none;"> In this paper, we propose WG-WaveNet, a fast, lightweight, and high-quality waveform generation model. WG-WaveNet is composed of a compact flow-based model and a post-filter. The two components are jointly trained by maximizing the likelihood of the training data and optimizing loss functions in the frequency domain. As we design a flow-based model that is heavily compressed, the proposed model requires far fewer computational resources than other waveform generation models during both training and inference; even though the model is highly compressed, the post-filter maintains the quality of the generated waveform. Our PyTorch implementation can be trained using less than 8 GB of GPU memory and generates audio samples at a rate of more than 960 kHz on an NVIDIA 1080Ti GPU. Furthermore, even when synthesizing on a CPU, we show that the proposed method is capable of generating a 44.1 kHz speech waveform 1.2 times faster than real-time. Experiments also show that the quality of the generated audio is comparable to that of other methods. Audio samples are publicly available online. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2005.07412v3-abstract-full').style.display = 'none'; document.getElementById('2005.07412v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 August, 2020; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 15 May, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2020. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">INTERSPEECH 2020</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2005.03457">arXiv:2005.03457</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2005.03457">pdf</a>, <a href="https://arxiv.org/format/2005.03457">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> NTIRE 2020 Challenge on NonHomogeneous Dehazing </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Ancuti%2C+C+O">Codruta O.
Ancuti</a>, <a href="/search/cs?searchtype=author&amp;query=Ancuti%2C+C">Cosmin Ancuti</a>, <a href="/search/cs?searchtype=author&amp;query=Vasluianu%2C+F">Florin-Alexandru Vasluianu</a>, <a href="/search/cs?searchtype=author&amp;query=Timofte%2C+R">Radu Timofte</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+J">Jing Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+H">Haiyan Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Xie%2C+Y">Yuan Xie</a>, <a href="/search/cs?searchtype=author&amp;query=Qu%2C+Y">Yanyun Qu</a>, <a href="/search/cs?searchtype=author&amp;query=Ma%2C+L">Lizhuang Ma</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+Z">Ziling Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Deng%2C+Q">Qili Deng</a>, <a href="/search/cs?searchtype=author&amp;query=Chao%2C+J">Ju-Chin Chao</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+T">Tsung-Shan Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+P">Peng-Wen Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Hsu%2C+P">Po-Min Hsu</a>, <a href="/search/cs?searchtype=author&amp;query=Liao%2C+T">Tzu-Yi Liao</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+C">Chung-En Sun</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+P">Pei-Yuan Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Do%2C+J">Jeonghyeok Do</a>, <a href="/search/cs?searchtype=author&amp;query=Park%2C+J">Jongmin Park</a>, <a href="/search/cs?searchtype=author&amp;query=Kim%2C+M">Munchurl Kim</a>, <a href="/search/cs?searchtype=author&amp;query=Metwaly%2C+K">Kareem Metwaly</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+X">Xuelu Li</a>, <a href="/search/cs?searchtype=author&amp;query=Guo%2C+T">Tiantong Guo</a>, <a href="/search/cs?searchtype=author&amp;query=Monga%2C+V">Vishal Monga</a> , et al. (27 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2005.03457v1-abstract-short" style="display: inline;"> This paper reviews the NTIRE 2020 Challenge on NonHomogeneous Dehazing of images (restoration of rich details in hazy image). We focus on the proposed solutions and their results evaluated on NH-Haze, a novel dataset consisting of 55 pairs of real haze free and nonhomogeneous hazy images recorded outdoor. NH-Haze is the first realistic nonhomogeneous haze dataset that provides ground truth images.&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2005.03457v1-abstract-full').style.display = 'inline'; document.getElementById('2005.03457v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2005.03457v1-abstract-full" style="display: none;"> This paper reviews the NTIRE 2020 Challenge on NonHomogeneous Dehazing of images (restoration of rich details in hazy image). We focus on the proposed solutions and their results evaluated on NH-Haze, a novel dataset consisting of 55 pairs of real haze free and nonhomogeneous hazy images recorded outdoor. NH-Haze is the first realistic nonhomogeneous haze dataset that provides ground truth images. The nonhomogeneous haze has been produced using a professional haze generator that imitates the real conditions of haze scenes. 168 participants registered in the challenge and 27 teams competed in the final testing phase. 
The proposed solutions gauge the state-of-the-art in image dehazing. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2005.03457v1-abstract-full').style.display = 'none'; document.getElementById('2005.03457v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 May, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2020. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">CVPR Workshops Proceedings 2020</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1912.02461">arXiv:1912.02461</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/1912.02461">pdf</a>, <a href="https://arxiv.org/ps/1912.02461">ps</a>, <a href="https://arxiv.org/format/1912.02461">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Towards Robust Neural Vocoding for Speech Generation: A Survey </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Hsu%2C+P">Po-chun Hsu</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+C">Chun-hsuan Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+A+T">Andy T. Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Lee%2C+H">Hung-yi Lee</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1912.02461v3-abstract-short" style="display: inline;"> Recently, neural vocoders have been widely used in speech synthesis tasks, including text-to-speech and voice conversion. However, when encountering data distribution mismatch between training and inference, neural vocoders trained on real data often degrade in voice quality for unseen scenarios. In this paper, we train four common neural vocoders, including WaveNet, WaveRNN, FFTNet, Parallel Wave&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1912.02461v3-abstract-full').style.display = 'inline'; document.getElementById('1912.02461v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1912.02461v3-abstract-full" style="display: none;"> Recently, neural vocoders have been widely used in speech synthesis tasks, including text-to-speech and voice conversion. However, when encountering data distribution mismatch between training and inference, neural vocoders trained on real data often degrade in voice quality for unseen scenarios. In this paper, we train four common neural vocoders, including WaveNet, WaveRNN, FFTNet, Parallel WaveGAN alternately on five different datasets. To study the robustness of neural vocoders, we evaluate the models using acoustic features from seen/unseen speakers, seen/unseen languages, a text-to-speech model, and a voice conversion model. 
We find that speaker variety is much more important than language variety for achieving a universal vocoder. Through our experiments, we show that WaveNet and WaveRNN are more suitable for text-to-speech models, while Parallel WaveGAN is more suitable for voice conversion applications. A large number of subjective naturalness MOS results for all vocoders are presented for future studies. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1912.02461v3-abstract-full').style.display = 'none'; document.getElementById('1912.02461v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 August, 2020; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 5 December, 2019; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2019. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Submitted to INTERSPEECH 2020</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1910.12638">arXiv:1910.12638</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/1910.12638">pdf</a>, <a href="https://arxiv.org/format/1910.12638">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1109/ICASSP40776.2020.9054458">10.1109/ICASSP40776.2020.9054458 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Mockingjay: Unsupervised Speech Representation Learning with Deep Bidirectional Transformer Encoders </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Liu%2C+A+T">Andy T. Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+S">Shu-wen Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Chi%2C+P">Po-Han Chi</a>, <a href="/search/cs?searchtype=author&amp;query=Hsu%2C+P">Po-chun Hsu</a>, <a href="/search/cs?searchtype=author&amp;query=Lee%2C+H">Hung-yi Lee</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1910.12638v2-abstract-short" style="display: inline;"> We present Mockingjay as a new speech representation learning approach, where bidirectional Transformer encoders are pre-trained on a large amount of unlabeled speech. Previous speech representation methods learn through conditioning on past frames and predicting information about future frames.
Whereas Mockingjay is designed to predict the current frame through jointly conditioning on both past a&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1910.12638v2-abstract-full').style.display = 'inline'; document.getElementById('1910.12638v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1910.12638v2-abstract-full" style="display: none;"> We present Mockingjay as a new speech representation learning approach, where bidirectional Transformer encoders are pre-trained on a large amount of unlabeled speech. Previous speech representation methods learn through conditioning on past frames and predicting information about future frames. Whereas Mockingjay is designed to predict the current frame through jointly conditioning on both past and future contexts. The Mockingjay representation improves performance for a wide range of downstream tasks, including phoneme classification, speaker recognition, and sentiment classification on spoken content, while outperforming other approaches. Mockingjay is empirically powerful and can be fine-tuned with downstream models, with only 2 epochs we further improve performance dramatically. In a low resource setting with only 0.1% of labeled data, we outperform the result of Mel-features that uses all 100% labeled data. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1910.12638v2-abstract-full').style.display = 'none'; document.getElementById('1910.12638v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 February, 2020; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 24 October, 2019; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2019. 
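<p class="is-size-7">(Illustrative sketch of the masked-frame pretraining idea described in the Mockingjay abstract above: a fraction of input frames is zeroed out and a bidirectional Transformer encoder is trained to reconstruct them from both past and future context. The layer sizes, the 15% mask ratio, and all names below are assumptions, not the authors' exact recipe.)</p> <pre><code class="language-python">
# Minimal sketch: generic masked-acoustic-frame pretraining step (assumed setup, not the paper's code).
import torch
import torch.nn as nn

class MaskedFrameModel(nn.Module):
    def __init__(self, n_mels=80, d_model=256, n_layers=3, n_heads=4):
        super().__init__()
        self.proj_in = nn.Linear(n_mels, d_model)
        layer = nn.TransformerEncoderLayer(d_model, n_heads, batch_first=True)
        self.encoder = nn.TransformerEncoder(layer, n_layers)  # attends to both directions
        self.proj_out = nn.Linear(d_model, n_mels)

    def forward(self, frames):                 # frames: (batch, time, n_mels)
        return self.proj_out(self.encoder(self.proj_in(frames)))

def pretrain_step(model, frames, mask_ratio=0.15):
    # Randomly select frames to mask, zero them out, and reconstruct them.
    mask = torch.rand(frames.shape[:2], device=frames.device).lt(mask_ratio)
    corrupted = frames.masked_fill(mask.unsqueeze(-1), 0.0)
    pred = model(corrupted)
    # Loss is computed on the masked positions only.
    return nn.functional.l1_loss(pred[mask], frames[mask])
</code></pre>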
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by ICASSP 2020, Lecture Session</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> ICASSP 2020 - 2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP) </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1909.11899">arXiv:1909.11899</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/1909.11899">pdf</a>, <a href="https://arxiv.org/ps/1909.11899">ps</a>, <a href="https://arxiv.org/format/1909.11899">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Quantitative Methods">q-bio.QM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computational Engineering, Finance, and Science">cs.CE</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Dynamical Systems">math.DS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Neurons and Cognition">q-bio.NC</span> </div> </div> <p class="title is-5 mathjax"> Dynamic Parameter Estimation of Brain Mechanisms </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Hsu%2C+P">Po-Ya Hsu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1909.11899v1-abstract-short" style="display: inline;"> Demystifying effective connectivity among neuronal populations has become the trend to understand the brain mechanisms of Parkinson&#39;s disease, schizophrenia, mild traumatic brain injury, and many other unlisted neurological diseases. Dynamic modeling is a state-of-the-art approach to explore various connectivities among neuronal populations corresponding to different electrophysiological responses&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1909.11899v1-abstract-full').style.display = 'inline'; document.getElementById('1909.11899v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1909.11899v1-abstract-full" style="display: none;"> Demystifying effective connectivity among neuronal populations has become the trend to understand the brain mechanisms of Parkinson&#39;s disease, schizophrenia, mild traumatic brain injury, and many other unlisted neurological diseases. Dynamic modeling is a state-of-the-art approach to explore various connectivities among neuronal populations corresponding to different electrophysiological responses. Through estimating the parameters in the dynamic models, including the strengths and propagation delays of the electrophysiological signals, the discovery of the underlying connectivities can lead to the elucidation of functional brain mechanisms. In this report, we survey six dynamic models that describe the intrinsic function of a single neuronal/subneuronal population and three effective network estimation methods that can trace the connections among the neuronal/subneuronal populations. 
The six dynamic models are event related potential, local field potential, conductance-based neural mass model, mean field model, neural field model, and canonical micro-circuits; the three effective network estimation approaches are dynamic causal modeling, structural causal model, and vector autoregression. Subsequently, we discuss dynamic parameter estimation methods including variational Bayesian, particle filtering, Metropolis-Hastings algorithm, Gauss-Newton algorithm, collocation method, and constrained optimization. We summarize the merits and drawbacks of each model, network estimation approach, and parameter estimation method. In addition, we demonstrate an exemplary effective network estimation problem statement. Last, we identify possible future work and challenges to develop an elevated package. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1909.11899v1-abstract-full').style.display = 'none'; document.getElementById('1909.11899v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 September, 2019; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2019. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1905.11563">arXiv:1905.11563</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/1905.11563">pdf</a>, <a href="https://arxiv.org/format/1905.11563">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.21437/Interspeech.2019-2048">10.21437/Interspeech.2019-2048 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Unsupervised End-to-End Learning of Discrete Linguistic Units for Voice Conversion </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Liu%2C+A+T">Andy T. Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Hsu%2C+P">Po-chun Hsu</a>, <a href="/search/cs?searchtype=author&amp;query=Lee%2C+H">Hung-yi Lee</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1905.11563v3-abstract-short" style="display: inline;"> We present an unsupervised end-to-end training scheme where we discover discrete subword units from speech without using any labels. 
The discrete subword units are learned under an ASR-TTS autoencoder reconstruction setting, where an ASR-Encoder is trained to discover a set of common linguistic units given a variety of speakers, and a TTS-Decoder trained to project the discovered units back to the&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1905.11563v3-abstract-full').style.display = 'inline'; document.getElementById('1905.11563v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1905.11563v3-abstract-full" style="display: none;"> We present an unsupervised end-to-end training scheme where we discover discrete subword units from speech without using any labels. The discrete subword units are learned under an ASR-TTS autoencoder reconstruction setting, where an ASR-Encoder is trained to discover a set of common linguistic units given a variety of speakers, and a TTS-Decoder trained to project the discovered units back to the designated speech. We propose a discrete encoding method, Multilabel-Binary Vectors (MBV), to make the ASR-TTS autoencoder differentiable. We found that the proposed encoding method offers automatic extraction of speech content from speaker style, and is sufficient to cover full linguistic content in a given language. Therefore, the TTS-Decoder can synthesize speech with the same content as the input of ASR-Encoder but with different speaker characteristics, which achieves voice conversion (VC). We further improve the quality of VC using adversarial training, where we train a TTS-Patcher that augments the output of TTS-Decoder. Objective and subjective evaluations show that the proposed approach offers strong VC results as it eliminates speaker identity while preserving content within speech. In the ZeroSpeech 2019 Challenge, we achieved outstanding performance in terms of low bitrate. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1905.11563v3-abstract-full').style.display = 'none'; document.getElementById('1905.11563v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 June, 2019; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 27 May, 2019; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2019. 
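<p class="is-size-7">(Illustrative sketch only: a generic straight-through, element-wise binarization bottleneck, shown to make the "discrete yet differentiable encoding" idea above concrete. This is a hedged stand-in and may not match the paper's exact Multilabel-Binary Vectors formulation.)</p> <pre><code class="language-python">
# Minimal sketch: straight-through binarization (assumed stand-in for a multilabel binary code).
import torch

def binarize_straight_through(logits):
    """Forward pass emits hard 0/1 codes; backward pass uses the sigmoid gradient."""
    probs = torch.sigmoid(logits)
    hard = probs.round()                       # hard multilabel code in {0, 1}
    # Straight-through trick: hard values forward, soft gradients backward.
    return hard.detach() + probs - probs.detach()

codes = binarize_straight_through(torch.randn(4, 128, requires_grad=True))
print(codes.shape, codes.unique())             # values are 0/1 but gradients still flow
</code></pre>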
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by Interspeech 2019, Graz, Austria</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> Interspeech 2019 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1808.03113">arXiv:1808.03113</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/1808.03113">pdf</a>, <a href="https://arxiv.org/format/1808.03113">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Rhythm-Flexible Voice Conversion without Parallel Data Using Cycle-GAN over Phoneme Posteriorgram Sequences </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Yeh%2C+C">Cheng-chieh Yeh</a>, <a href="/search/cs?searchtype=author&amp;query=Hsu%2C+P">Po-chun Hsu</a>, <a href="/search/cs?searchtype=author&amp;query=Chou%2C+J">Ju-chieh Chou</a>, <a href="/search/cs?searchtype=author&amp;query=Lee%2C+H">Hung-yi Lee</a>, <a href="/search/cs?searchtype=author&amp;query=Lee%2C+L">Lin-shan Lee</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1808.03113v1-abstract-short" style="display: inline;"> Speaking rate refers to the average number of phonemes within some unit time, while the rhythmic patterns refer to duration distributions for realizations of different phonemes within different phonetic structures. Both are key components of prosody in speech, which is different for different speakers. Models like cycle-consistent adversarial network (Cycle-GAN) and variational auto-encoder (VAE)&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1808.03113v1-abstract-full').style.display = 'inline'; document.getElementById('1808.03113v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1808.03113v1-abstract-full" style="display: none;"> Speaking rate refers to the average number of phonemes within some unit time, while the rhythmic patterns refer to duration distributions for realizations of different phonemes within different phonetic structures. Both are key components of prosody in speech, which is different for different speakers. Models like cycle-consistent adversarial network (Cycle-GAN) and variational auto-encoder (VAE) have been successfully applied to voice conversion tasks without parallel data. However, due to the neural network architectures and feature vectors chosen for these approaches, the length of the predicted utterance has to be fixed to that of the input utterance, which limits the flexibility in mimicking the speaking rates and rhythmic patterns for the target speaker. On the other hand, sequence-to-sequence learning model was used to remove the above length constraint, but parallel training data are needed. 
In this paper, we propose an approach utilizing sequence-to-sequence model trained with unsupervised Cycle-GAN to perform the transformation between the phoneme posteriorgram sequences for different speakers. In this way, the length constraint mentioned above is removed to offer rhythm-flexible voice conversion without requiring parallel data. Preliminary evaluation on two datasets showed very encouraging results. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1808.03113v1-abstract-full').style.display = 'none'; document.getElementById('1808.03113v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 August, 2018; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2018. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">8 pages, 6 figures, Submitted to SLT 2018</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/0710.4645">arXiv:0710.4645</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/0710.4645">pdf</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Hardware Architecture">cs.AR</span> </div> </div> <p class="title is-5 mathjax"> At-Speed Logic BIST for IP Cores </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Cheon%2C+B">B. Cheon</a>, <a href="/search/cs?searchtype=author&amp;query=Lee%2C+E">E. Lee</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+L+-">L. -T. Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Wen%2C+X">X. Wen</a>, <a href="/search/cs?searchtype=author&amp;query=Hsu%2C+P">P. Hsu</a>, <a href="/search/cs?searchtype=author&amp;query=Cho%2C+J">J. Cho</a>, <a href="/search/cs?searchtype=author&amp;query=Park%2C+J">J. Park</a>, <a href="/search/cs?searchtype=author&amp;query=Chao%2C+H">H. Chao</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+S">S. Wu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="0710.4645v1-abstract-short" style="display: inline;"> This paper describes a flexible logic BIST scheme that features high fault coverage achieved by fault-simulation guided test point insertion, real at-speed test capability for multi-clock designs without clock frequency manipulation, and easy physical implementation due to the use of a low-speed SE signal. Application results of this scheme to two widely used IP cores are also reported. </span> <span class="abstract-full has-text-grey-dark mathjax" id="0710.4645v1-abstract-full" style="display: none;"> This paper describes a flexible logic BIST scheme that features high fault coverage achieved by fault-simulation guided test point insertion, real at-speed test capability for multi-clock designs without clock frequency manipulation, and easy physical implementation due to the use of a low-speed SE signal. Application results of this scheme to two widely used IP cores are also reported. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('0710.4645v1-abstract-full').style.display = 'none'; document.getElementById('0710.4645v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 October, 2007; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2007. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Submitted on behalf of EDAA (http://www.edaa.com/)</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> Dans Design, Automation and Test in Europe - DATE&#39;05, Munich : Allemagne (2005) </p> </li> </ol> <div class="is-hidden-tablet"> <!-- feedback for mobile only --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>&nbsp;&nbsp;</span> </div> </div> </main> <footer> <div class="columns is-desktop" role="navigation" aria-label="Secondary"> <!-- MetaColumn 1 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div> <!-- end MetaColumn 1 --> <!-- MetaColumn 2 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 
0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div> <!-- end MetaColumn 2 --> </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>
