CINXE.COM
Search | arXiv e-print repository
<!DOCTYPE html> <html lang="en"> <head> <meta charset="utf-8"/> <meta name="viewport" content="width=device-width, initial-scale=1"/> <!-- new favicon config and versions by realfavicongenerator.net --> <link rel="apple-touch-icon" sizes="180x180" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/apple-touch-icon.png"> <link rel="icon" type="image/png" sizes="32x32" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-32x32.png"> <link rel="icon" type="image/png" sizes="16x16" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-16x16.png"> <link rel="manifest" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/site.webmanifest"> <link rel="mask-icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/safari-pinned-tab.svg" color="#b31b1b"> <link rel="shortcut icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon.ico"> <meta name="msapplication-TileColor" content="#b31b1b"> <meta name="msapplication-config" content="images/icons/browserconfig.xml"> <meta name="theme-color" content="#b31b1b"> <!-- end favicon config --> <title>Search | arXiv e-print repository</title> <script defer src="https://static.arxiv.org/static/base/1.0.0a5/fontawesome-free-5.11.2-web/js/all.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/base/1.0.0a5/css/arxivstyle.css" /> <script type="text/x-mathjax-config"> MathJax.Hub.Config({ messageStyle: "none", extensions: ["tex2jax.js"], jax: ["input/TeX", "output/HTML-CSS"], tex2jax: { inlineMath: [ ['$','$'], ["\\(","\\)"] ], displayMath: [ ['$$','$$'], ["\\[","\\]"] ], processEscapes: true, ignoreClass: '.*', processClass: 'mathjax.*' }, TeX: { extensions: ["AMSmath.js", "AMSsymbols.js", "noErrors.js"], noErrors: { inlineDelimiters: ["$","$"], multiLine: false, style: { "font-size": "normal", "border": "" } } }, "HTML-CSS": { availableFonts: ["TeX"] } }); </script> <script src='//static.arxiv.org/MathJax-2.7.3/MathJax.js'></script> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/notification.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/bulma-tooltip.min.css" /> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/search.css" /> <script src="https://code.jquery.com/jquery-3.2.1.slim.min.js" integrity="sha256-k2WSCIexGzOj3Euiig+TlR8gA0EmPjuc79OEeY5L45g=" crossorigin="anonymous"></script> <script src="https://static.arxiv.org/static/search/0.5.6/js/fieldset.js"></script> <style> radio#cf-customfield_11400 { display: none; } </style> </head> <body> <header><a href="#main-container" class="is-sr-only">Skip to main content</a> <!-- contains Cornell logo and sponsor statement --> <div class="attribution level is-marginless" role="banner"> <div class="level-left"> <a class="level-item" href="https://cornell.edu/"><img src="https://static.arxiv.org/static/base/1.0.0a5/images/cornell-reduced-white-SMALL.svg" alt="Cornell University" width="200" aria-label="logo" /></a> </div> <div class="level-right is-marginless"><p class="sponsors level-item is-marginless"><span id="support-ack-url">We gratefully acknowledge support from<br /> the Simons Foundation, <a href="https://info.arxiv.org/about/ourmembers.html">member institutions</a>, and all contributors. <a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1–50 of 382 results for author: <span class="mathjax">Li, B</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a> </span> </div> </div> <div class="content"> <form method="GET" action="/search/eess" aria-role="search"> Searching in archive <strong>eess</strong>. <a href="/search/?searchtype=author&query=Li%2C+B">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." type="text" value="Li, B"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Li%2C+B&terms-0-field=author&size=50&order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Li, B"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. </div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Li%2C+B&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Li%2C+B&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Li%2C+B&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Li%2C+B&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Li%2C+B&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Li%2C+B&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">…</span></li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.20762">arXiv:2502.20762</a> <span> [<a href="https://arxiv.org/pdf/2502.20762">pdf</a>, <a href="https://arxiv.org/format/2502.20762">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Towards Practical Real-Time Neural Video Compression </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Jia%2C+Z">Zhaoyang Jia</a>, <a href="/search/eess?searchtype=author&query=Li%2C+B">Bin Li</a>, <a href="/search/eess?searchtype=author&query=Li%2C+J">Jiahao Li</a>, <a href="/search/eess?searchtype=author&query=Xie%2C+W">Wenxuan Xie</a>, <a href="/search/eess?searchtype=author&query=Qi%2C+L">Linfeng Qi</a>, <a href="/search/eess?searchtype=author&query=Li%2C+H">Houqiang Li</a>, <a href="/search/eess?searchtype=author&query=Lu%2C+Y">Yan Lu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.20762v1-abstract-short" style="display: inline;"> We introduce a practical real-time neural video codec (NVC) designed to deliver high compression ratio, low latency and broad versatility. In practice, the coding speed of NVCs depends on 1) computational costs, and 2) non-computational operational costs, such as memory I/O and the number of function calls. While most efficient NVCs prioritize reducing computational cost, we identify operational c… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.20762v1-abstract-full').style.display = 'inline'; document.getElementById('2502.20762v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.20762v1-abstract-full" style="display: none;"> We introduce a practical real-time neural video codec (NVC) designed to deliver high compression ratio, low latency and broad versatility. In practice, the coding speed of NVCs depends on 1) computational costs, and 2) non-computational operational costs, such as memory I/O and the number of function calls. While most efficient NVCs prioritize reducing computational cost, we identify operational cost as the primary bottleneck to achieving higher coding speed. Leveraging this insight, we introduce a set of efficiency-driven design improvements focused on minimizing operational costs. Specifically, we employ implicit temporal modeling to eliminate complex explicit motion modules, and use single low-resolution latent representations rather than progressive downsampling. These innovations significantly accelerate NVC without sacrificing compression quality. Additionally, we implement model integerization for consistent cross-device coding and a module-bank-based rate control scheme to improve practical adaptability. Experiments show our proposed DCVC-RT achieves an impressive average encoding/decoding speed at 125.2/112.8 fps (frames per second) for 1080p video, while saving an average of 21% in bitrate compared to H.266/VTM. The code is available at https://github.com/microsoft/DCVC. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.20762v1-abstract-full').style.display = 'none'; document.getElementById('2502.20762v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">CVPR 2025. The code is available at https://github.com/microsoft/DCVC</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.19568">arXiv:2502.19568</a> <span> [<a href="https://arxiv.org/pdf/2502.19568">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> PhenoProfiler: Advancing Phenotypic Learning for Image-based Drug Discovery </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Li%2C+B">Bo Li</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+B">Bob Zhang</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+C">Chengyang Zhang</a>, <a href="/search/eess?searchtype=author&query=Zhou%2C+M">Minghao Zhou</a>, <a href="/search/eess?searchtype=author&query=Huang%2C+W">Weiliang Huang</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+S">Shihang Wang</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+Q">Qing Wang</a>, <a href="/search/eess?searchtype=author&query=Li%2C+M">Mengran Li</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+Y">Yong Zhang</a>, <a href="/search/eess?searchtype=author&query=Song%2C+Q">Qianqian Song</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.19568v1-abstract-short" style="display: inline;"> In the field of image-based drug discovery, capturing the phenotypic response of cells to various drug treatments and perturbations is a crucial step. However, existing methods require computationally extensive and complex multi-step procedures, which can introduce inefficiencies, limit generalizability, and increase potential errors. To address these challenges, we present PhenoProfiler, an innov… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.19568v1-abstract-full').style.display = 'inline'; document.getElementById('2502.19568v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.19568v1-abstract-full" style="display: none;"> In the field of image-based drug discovery, capturing the phenotypic response of cells to various drug treatments and perturbations is a crucial step. However, existing methods require computationally extensive and complex multi-step procedures, which can introduce inefficiencies, limit generalizability, and increase potential errors. To address these challenges, we present PhenoProfiler, an innovative model designed to efficiently and effectively extract morphological representations, enabling the elucidation of phenotypic changes induced by treatments. PhenoProfiler is designed as an end-to-end tool that processes whole-slide multi-channel images directly into low-dimensional quantitative representations, eliminating the extensive computational steps required by existing methods. It also includes a multi-objective learning module to enhance robustness, accuracy, and generalization in morphological representation learning. PhenoProfiler is rigorously evaluated on large-scale publicly available datasets, including over 230,000 whole-slide multi-channel images in end-to-end scenarios and more than 8.42 million single-cell images in non-end-to-end settings. Across these benchmarks, PhenoProfiler consistently outperforms state-of-the-art methods by up to 20%, demonstrating substantial improvements in both accuracy and robustness. Furthermore, PhenoProfiler uses a tailored phenotype correction strategy to emphasize relative phenotypic changes under treatments, facilitating the detection of biologically meaningful signals. UMAP visualizations of treatment profiles demonstrate PhenoProfiler ability to effectively cluster treatments with similar biological annotations, thereby enhancing interpretability. These findings establish PhenoProfiler as a scalable, generalizable, and robust tool for phenotypic learning. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.19568v1-abstract-full').style.display = 'none'; document.getElementById('2502.19568v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.18185">arXiv:2502.18185</a> <span> [<a href="https://arxiv.org/pdf/2502.18185">pdf</a>, <a href="https://arxiv.org/format/2502.18185">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> VesselSAM: Leveraging SAM for Aortic Vessel Segmentation with LoRA and Atrous Attention </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Iltaf%2C+A">Adnan Iltaf</a>, <a href="/search/eess?searchtype=author&query=Ahmed%2C+R+M">Rayan Merghani Ahmed</a>, <a href="/search/eess?searchtype=author&query=Li%2C+B">Bin Li</a>, <a href="/search/eess?searchtype=author&query=Zhou%2C+S">Shoujun Zhou</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.18185v1-abstract-short" style="display: inline;"> Medical image segmentation is crucial for clinical diagnosis and treatment planning, particularly for complex anatomical structures like vessels. In this work, we propose VesselSAM, a modified version of the Segmentation Anything Model (SAM), specifically designed for aortic vessel segmentation. VesselSAM incorporates AtrousLoRA, a novel module that combines Atrous Attention with Low-Rank Adaptati… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.18185v1-abstract-full').style.display = 'inline'; document.getElementById('2502.18185v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.18185v1-abstract-full" style="display: none;"> Medical image segmentation is crucial for clinical diagnosis and treatment planning, particularly for complex anatomical structures like vessels. In this work, we propose VesselSAM, a modified version of the Segmentation Anything Model (SAM), specifically designed for aortic vessel segmentation. VesselSAM incorporates AtrousLoRA, a novel module that combines Atrous Attention with Low-Rank Adaptation (LoRA), to improve segmentation performance. Atrous Attention enables the model to capture multi-scale contextual information, preserving both fine local details and broader global context. At the same time, LoRA facilitates efficient fine-tuning of the frozen SAM image encoder, reducing the number of trainable parameters and ensuring computational efficiency. We evaluate VesselSAM on two challenging datasets: the Aortic Vessel Tree (AVT) dataset and the Type-B Aortic Dissection (TBAD) dataset. VesselSAM achieves state-of-the-art performance with DSC scores of 93.50\%, 93.25\%, 93.02\%, and 93.26\% across multiple medical centers. Our results demonstrate that VesselSAM delivers high segmentation accuracy while significantly reducing computational overhead compared to existing large-scale models. This development paves the way for enhanced AI-based aortic vessel segmentation in clinical environments. The code and models will be released at https://github.com/Adnan-CAS/AtrousLora. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.18185v1-abstract-full').style.display = 'none'; document.getElementById('2502.18185v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Submitted to IEEE JBHI</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.11946">arXiv:2502.11946</a> <span> [<a href="https://arxiv.org/pdf/2502.11946">pdf</a>, <a href="https://arxiv.org/format/2502.11946">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Step-Audio: Unified Understanding and Generation in Intelligent Speech Interaction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Huang%2C+A">Ailin Huang</a>, <a href="/search/eess?searchtype=author&query=Wu%2C+B">Boyong Wu</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+B">Bruce Wang</a>, <a href="/search/eess?searchtype=author&query=Yan%2C+C">Chao Yan</a>, <a href="/search/eess?searchtype=author&query=Hu%2C+C">Chen Hu</a>, <a href="/search/eess?searchtype=author&query=Feng%2C+C">Chengli Feng</a>, <a href="/search/eess?searchtype=author&query=Tian%2C+F">Fei Tian</a>, <a href="/search/eess?searchtype=author&query=Shen%2C+F">Feiyu Shen</a>, <a href="/search/eess?searchtype=author&query=Li%2C+J">Jingbei Li</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+M">Mingrui Chen</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+P">Peng Liu</a>, <a href="/search/eess?searchtype=author&query=Miao%2C+R">Ruihang Miao</a>, <a href="/search/eess?searchtype=author&query=You%2C+W">Wang You</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+X">Xi Chen</a>, <a href="/search/eess?searchtype=author&query=Yang%2C+X">Xuerui Yang</a>, <a href="/search/eess?searchtype=author&query=Huang%2C+Y">Yechang Huang</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+Y">Yuxiang Zhang</a>, <a href="/search/eess?searchtype=author&query=Gong%2C+Z">Zheng Gong</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+Z">Zixin Zhang</a>, <a href="/search/eess?searchtype=author&query=Zhou%2C+H">Hongyu Zhou</a>, <a href="/search/eess?searchtype=author&query=Sun%2C+J">Jianjian Sun</a>, <a href="/search/eess?searchtype=author&query=Li%2C+B">Brian Li</a>, <a href="/search/eess?searchtype=author&query=Feng%2C+C">Chengting Feng</a>, <a href="/search/eess?searchtype=author&query=Wan%2C+C">Changyi Wan</a>, <a href="/search/eess?searchtype=author&query=Hu%2C+H">Hanpeng Hu</a> , et al. (120 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.11946v2-abstract-short" style="display: inline;"> Real-time speech interaction, serving as a fundamental interface for human-machine collaboration, holds immense potential. However, current open-source models face limitations such as high costs in voice data collection, weakness in dynamic control, and limited intelligence. To address these challenges, this paper introduces Step-Audio, the first production-ready open-source solution. Key contribu… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.11946v2-abstract-full').style.display = 'inline'; document.getElementById('2502.11946v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.11946v2-abstract-full" style="display: none;"> Real-time speech interaction, serving as a fundamental interface for human-machine collaboration, holds immense potential. However, current open-source models face limitations such as high costs in voice data collection, weakness in dynamic control, and limited intelligence. To address these challenges, this paper introduces Step-Audio, the first production-ready open-source solution. Key contributions include: 1) a 130B-parameter unified speech-text multi-modal model that achieves unified understanding and generation, with the Step-Audio-Chat version open-sourced; 2) a generative speech data engine that establishes an affordable voice cloning framework and produces the open-sourced lightweight Step-Audio-TTS-3B model through distillation; 3) an instruction-driven fine control system enabling dynamic adjustments across dialects, emotions, singing, and RAP; 4) an enhanced cognitive architecture augmented with tool calling and role-playing abilities to manage complex tasks effectively. Based on our new StepEval-Audio-360 evaluation benchmark, Step-Audio achieves state-of-the-art performance in human evaluations, especially in terms of instruction following. On open-source benchmarks like LLaMA Question, shows 9.3% average performance improvement, demonstrating our commitment to advancing the development of open-source multi-modal language technologies. Our code and models are available at https://github.com/stepfun-ai/Step-Audio. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.11946v2-abstract-full').style.display = 'none'; document.getElementById('2502.11946v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 17 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.06490">arXiv:2502.06490</a> <span> [<a href="https://arxiv.org/pdf/2502.06490">pdf</a>, <a href="https://arxiv.org/format/2502.06490">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Recent Advances in Discrete Speech Tokens: A Review </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Guo%2C+Y">Yiwei Guo</a>, <a href="/search/eess?searchtype=author&query=Li%2C+Z">Zhihan Li</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+H">Hankun Wang</a>, <a href="/search/eess?searchtype=author&query=Li%2C+B">Bohan Li</a>, <a href="/search/eess?searchtype=author&query=Shao%2C+C">Chongtian Shao</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+H">Hanglei Zhang</a>, <a href="/search/eess?searchtype=author&query=Du%2C+C">Chenpeng Du</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+X">Xie Chen</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+S">Shujie Liu</a>, <a href="/search/eess?searchtype=author&query=Yu%2C+K">Kai Yu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.06490v2-abstract-short" style="display: inline;"> The rapid advancement of speech generation technologies in the era of large language models (LLMs) has established discrete speech tokens as a foundational paradigm for speech representation. These tokens, characterized by their discrete, compact, and concise nature, are not only advantageous for efficient transmission and storage, but also inherently compatible with the language modeling framewor… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.06490v2-abstract-full').style.display = 'inline'; document.getElementById('2502.06490v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.06490v2-abstract-full" style="display: none;"> The rapid advancement of speech generation technologies in the era of large language models (LLMs) has established discrete speech tokens as a foundational paradigm for speech representation. These tokens, characterized by their discrete, compact, and concise nature, are not only advantageous for efficient transmission and storage, but also inherently compatible with the language modeling framework, enabling seamless integration of speech into text-dominated LLM architectures. Current research categorizes discrete speech tokens into two principal classes: acoustic tokens and semantic tokens, each of which has evolved into a rich research domain characterized by unique design philosophies and methodological approaches. This survey systematically synthesizes the existing taxonomy and recent innovations in discrete speech tokenization, conducts a critical examination of the strengths and limitations of each paradigm, and presents systematic experimental comparisons across token types. Furthermore, we identify persistent challenges in the field and propose potential research directions, aiming to offer actionable insights to inspire future advancements in the development and application of discrete speech tokens. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.06490v2-abstract-full').style.display = 'none'; document.getElementById('2502.06490v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 10 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">23 pages, 8 figures, 3 tables. Work in progress</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.03132">arXiv:2502.03132</a> <span> [<a href="https://arxiv.org/pdf/2502.03132">pdf</a>, <a href="https://arxiv.org/format/2502.03132">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> SPARK: A Modular Benchmark for Humanoid Robot Safety </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Sun%2C+Y">Yifan Sun</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+R">Rui Chen</a>, <a href="/search/eess?searchtype=author&query=Yun%2C+K+S">Kai S. Yun</a>, <a href="/search/eess?searchtype=author&query=Fang%2C+Y">Yikuan Fang</a>, <a href="/search/eess?searchtype=author&query=Jung%2C+S">Sebin Jung</a>, <a href="/search/eess?searchtype=author&query=Li%2C+F">Feihan Li</a>, <a href="/search/eess?searchtype=author&query=Li%2C+B">Bowei Li</a>, <a href="/search/eess?searchtype=author&query=Zhao%2C+W">Weiye Zhao</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+C">Changliu Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.03132v1-abstract-short" style="display: inline;"> This paper introduces the Safe Protective and Assistive Robot Kit (SPARK), a comprehensive benchmark designed to ensure safety in humanoid autonomy and teleoperation. Humanoid robots pose significant safety risks due to their physical capabilities of interacting with complex environments. The physical structures of humanoid robots further add complexity to the design of general safety solutions. T… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.03132v1-abstract-full').style.display = 'inline'; document.getElementById('2502.03132v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.03132v1-abstract-full" style="display: none;"> This paper introduces the Safe Protective and Assistive Robot Kit (SPARK), a comprehensive benchmark designed to ensure safety in humanoid autonomy and teleoperation. Humanoid robots pose significant safety risks due to their physical capabilities of interacting with complex environments. The physical structures of humanoid robots further add complexity to the design of general safety solutions. To facilitate the safe deployment of complex robot systems, SPARK can be used as a toolbox that comes with state-of-the-art safe control algorithms in a modular and composable robot control framework. Users can easily configure safety criteria and sensitivity levels to optimize the balance between safety and performance. To accelerate humanoid safety research and development, SPARK provides a simulation benchmark that compares safety approaches in a variety of environments, tasks, and robot models. Furthermore, SPARK allows quick deployment of synthesized safe controllers on real robots. For hardware deployment, SPARK supports Apple Vision Pro (AVP) or a Motion Capture System as external sensors, while also offering interfaces for seamless integration with alternative hardware setups. This paper demonstrates SPARK's capability with both simulation experiments and case studies with a Unitree G1 humanoid robot. Leveraging these advantages of SPARK, users and researchers can significantly improve the safety of their humanoid systems as well as accelerate relevant research. The open-source code is available at https://github.com/intelligent-control-lab/spark. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.03132v1-abstract-full').style.display = 'none'; document.getElementById('2502.03132v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.01057">arXiv:2502.01057</a> <span> [<a href="https://arxiv.org/pdf/2502.01057">pdf</a>, <a href="https://arxiv.org/format/2502.01057">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> FetDTIAlign: A Deep Learning Framework for Affine and Deformable Registration of Fetal Brain dMRI </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Li%2C+B">Bo Li</a>, <a href="/search/eess?searchtype=author&query=Zeng%2C+Q">Qi Zeng</a>, <a href="/search/eess?searchtype=author&query=Warfield%2C+S+K">Simon K. Warfield</a>, <a href="/search/eess?searchtype=author&query=Karimi%2C+D">Davood Karimi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.01057v3-abstract-short" style="display: inline;"> Diffusion MRI (dMRI) provides unique insights into fetal brain microstructure in utero. Longitudinal and cross-sectional fetal dMRI studies can reveal crucial neurodevelopmental changes but require precise spatial alignment across scans and subjects. This is challenging due to low data quality, rapid brain development, and limited anatomical landmarks. Existing registration methods, designed for h… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.01057v3-abstract-full').style.display = 'inline'; document.getElementById('2502.01057v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.01057v3-abstract-full" style="display: none;"> Diffusion MRI (dMRI) provides unique insights into fetal brain microstructure in utero. Longitudinal and cross-sectional fetal dMRI studies can reveal crucial neurodevelopmental changes but require precise spatial alignment across scans and subjects. This is challenging due to low data quality, rapid brain development, and limited anatomical landmarks. Existing registration methods, designed for high-quality adult data, struggle with these complexities. To address this, we introduce FetDTIAlign, a deep learning approach for fetal brain dMRI registration, enabling accurate affine and deformable alignment. FetDTIAlign features a dual-encoder architecture and iterative feature-based inference, reducing the impact of noise and low resolution. It optimizes network configurations and domain-specific features at each registration stage, enhancing both robustness and accuracy. We validated FetDTIAlign on data from 23 to 36 weeks gestation, covering 60 white matter tracts. It consistently outperformed two classical optimization-based methods and a deep learning pipeline, achieving superior anatomical correspondence. Further validation on external data from the Developing Human Connectome Project confirmed its generalizability across acquisition protocols. Our results demonstrate the feasibility of deep learning for fetal brain dMRI registration, providing a more accurate and reliable alternative to classical techniques. By enabling precise cross-subject and tract-specific analyses, FetDTIAlign supports new discoveries in early brain development. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.01057v3-abstract-full').style.display = 'none'; document.getElementById('2502.01057v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 3 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Under review</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.16306">arXiv:2501.16306</a> <span> [<a href="https://arxiv.org/pdf/2501.16306">pdf</a>, <a href="https://arxiv.org/format/2501.16306">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Networking and Internet Architecture">cs.NI</span> </div> </div> <p class="title is-5 mathjax"> Graph Neural Network Based Hybrid Beamforming Design in Wideband Terahertz MIMO-OFDM Systems </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Li%2C+B">Beier Li</a>, <a href="/search/eess?searchtype=author&query=Vu%2C+M">Mai Vu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.16306v1-abstract-short" style="display: inline;"> 6G wireless technology is projected to adopt higher and wider frequency bands, enabled by highly directional beamforming. However, the vast bandwidths available also make the impact of beam squint in massive multiple input and multiple output (MIMO) systems non-negligible. Traditional approaches such as adding a true-time-delay line (TTD) on each antenna are costly due to the massive antenna array… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.16306v1-abstract-full').style.display = 'inline'; document.getElementById('2501.16306v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.16306v1-abstract-full" style="display: none;"> 6G wireless technology is projected to adopt higher and wider frequency bands, enabled by highly directional beamforming. However, the vast bandwidths available also make the impact of beam squint in massive multiple input and multiple output (MIMO) systems non-negligible. Traditional approaches such as adding a true-time-delay line (TTD) on each antenna are costly due to the massive antenna arrays required. This paper puts forth a signal processing alternative, specifically adapted to the multicarrier structure of OFDM systems, through an innovative application of Graph Neural Networks (GNNs) to optimize hybrid beamforming. By integrating two types of graph nodes to represent the analog and the digital beamforming matrices efficiently, our approach not only reduces the computational and memory burdens but also achieves high spectral efficiency performance, approaching that of all digital beamforming. The GNN runtime and memory requirement are at a fraction of the processing time and resource consumption of traditional signal processing methods, hence enabling real-time adaptation of hybrid beamforming. Furthermore, the proposed GNN exhibits strong resiliency to beam squinting, achieving almost constant spectral efficiency even as the system bandwidth increases at higher carrier frequencies. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.16306v1-abstract-full').style.display = 'none'; document.getElementById('2501.16306v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">6 pages, 7 figures. This conference paper was published in the 2024 IEEE International Symposium on Phased Array Systems and Technology</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.15368">arXiv:2501.15368</a> <span> [<a href="https://arxiv.org/pdf/2501.15368">pdf</a>, <a href="https://arxiv.org/format/2501.15368">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Baichuan-Omni-1.5 Technical Report </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Li%2C+Y">Yadong Li</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+J">Jun Liu</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+T">Tao Zhang</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+T">Tao Zhang</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+S">Song Chen</a>, <a href="/search/eess?searchtype=author&query=Li%2C+T">Tianpeng Li</a>, <a href="/search/eess?searchtype=author&query=Li%2C+Z">Zehuan Li</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+L">Lijun Liu</a>, <a href="/search/eess?searchtype=author&query=Ming%2C+L">Lingfeng Ming</a>, <a href="/search/eess?searchtype=author&query=Dong%2C+G">Guosheng Dong</a>, <a href="/search/eess?searchtype=author&query=Pan%2C+D">Da Pan</a>, <a href="/search/eess?searchtype=author&query=Li%2C+C">Chong Li</a>, <a href="/search/eess?searchtype=author&query=Fang%2C+Y">Yuanbo Fang</a>, <a href="/search/eess?searchtype=author&query=Kuang%2C+D">Dongdong Kuang</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+M">Mingrui Wang</a>, <a href="/search/eess?searchtype=author&query=Zhu%2C+C">Chenglin Zhu</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+Y">Youwei Zhang</a>, <a href="/search/eess?searchtype=author&query=Guo%2C+H">Hongyu Guo</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+F">Fengyu Zhang</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+Y">Yuran Wang</a>, <a href="/search/eess?searchtype=author&query=Ding%2C+B">Bowen Ding</a>, <a href="/search/eess?searchtype=author&query=Song%2C+W">Wei Song</a>, <a href="/search/eess?searchtype=author&query=Li%2C+X">Xu Li</a>, <a href="/search/eess?searchtype=author&query=Huo%2C+Y">Yuqi Huo</a>, <a href="/search/eess?searchtype=author&query=Liang%2C+Z">Zheng Liang</a> , et al. (68 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.15368v1-abstract-short" style="display: inline;"> We introduce Baichuan-Omni-1.5, an omni-modal model that not only has omni-modal understanding capabilities but also provides end-to-end audio generation capabilities. To achieve fluent and high-quality interaction across modalities without compromising the capabilities of any modality, we prioritized optimizing three key aspects. First, we establish a comprehensive data cleaning and synthesis pip… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.15368v1-abstract-full').style.display = 'inline'; document.getElementById('2501.15368v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.15368v1-abstract-full" style="display: none;"> We introduce Baichuan-Omni-1.5, an omni-modal model that not only has omni-modal understanding capabilities but also provides end-to-end audio generation capabilities. To achieve fluent and high-quality interaction across modalities without compromising the capabilities of any modality, we prioritized optimizing three key aspects. First, we establish a comprehensive data cleaning and synthesis pipeline for multimodal data, obtaining about 500B high-quality data (text, audio, and vision). Second, an audio-tokenizer (Baichuan-Audio-Tokenizer) has been designed to capture both semantic and acoustic information from audio, enabling seamless integration and enhanced compatibility with MLLM. Lastly, we designed a multi-stage training strategy that progressively integrates multimodal alignment and multitask fine-tuning, ensuring effective synergy across all modalities. Baichuan-Omni-1.5 leads contemporary models (including GPT4o-mini and MiniCPM-o 2.6) in terms of comprehensive omni-modal capabilities. Notably, it achieves results comparable to leading models such as Qwen2-VL-72B across various multimodal medical benchmarks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.15368v1-abstract-full').style.display = 'none'; document.getElementById('2501.15368v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.08566">arXiv:2501.08566</a> <span> [<a href="https://arxiv.org/pdf/2501.08566">pdf</a>, <a href="https://arxiv.org/format/2501.08566">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Towards Lightweight and Stable Zero-shot TTS with Self-distilled Representation Disentanglement </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Chen%2C+Q">Qianniu Chen</a>, <a href="/search/eess?searchtype=author&query=Hao%2C+X">Xiaoyang Hao</a>, <a href="/search/eess?searchtype=author&query=Li%2C+B">Bowen Li</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+Y">Yue Liu</a>, <a href="/search/eess?searchtype=author&query=Lu%2C+L">Li Lu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.08566v1-abstract-short" style="display: inline;"> Zero-shot Text-To-Speech (TTS) synthesis shows great promise for personalized voice customization through voice cloning. However, current methods for achieving zero-shot TTS heavily rely on large model scales and extensive training datasets to ensure satisfactory performance and generalizability across various speakers. This raises concerns regarding both deployment costs and data security. In thi… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.08566v1-abstract-full').style.display = 'inline'; document.getElementById('2501.08566v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.08566v1-abstract-full" style="display: none;"> Zero-shot Text-To-Speech (TTS) synthesis shows great promise for personalized voice customization through voice cloning. However, current methods for achieving zero-shot TTS heavily rely on large model scales and extensive training datasets to ensure satisfactory performance and generalizability across various speakers. This raises concerns regarding both deployment costs and data security. In this paper, we present a lightweight and stable zero-shot TTS system. We introduce a novel TTS architecture designed to effectively model linguistic content and various speaker attributes from source speech and prompt speech, respectively. Furthermore, we present a two-stage self-distillation framework that constructs parallel data pairs for effectively disentangling linguistic content and speakers from the perspective of training data. Extensive experiments show that our system exhibits excellent performance and superior stability on the zero-shot TTS tasks. Moreover, it shows markedly superior computational efficiency, with RTFs of 0.13 and 0.012 on the CPU and GPU, respectively. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.08566v1-abstract-full').style.display = 'none'; document.getElementById('2501.08566v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">5 pages,4 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.08057">arXiv:2501.08057</a> <span> [<a href="https://arxiv.org/pdf/2501.08057">pdf</a>, <a href="https://arxiv.org/format/2501.08057">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> Optimizing Speech Multi-View Feature Fusion through Conditional Computation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Shan%2C+W">Weiqiao Shan</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+Y">Yuhao Zhang</a>, <a href="/search/eess?searchtype=author&query=Han%2C+Y">Yuchen Han</a>, <a href="/search/eess?searchtype=author&query=Li%2C+B">Bei Li</a>, <a href="/search/eess?searchtype=author&query=Zhao%2C+X">Xiaofeng Zhao</a>, <a href="/search/eess?searchtype=author&query=Li%2C+Y">Yuang Li</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+M">Min Zhang</a>, <a href="/search/eess?searchtype=author&query=Yang%2C+H">Hao Yang</a>, <a href="/search/eess?searchtype=author&query=Xiao%2C+T">Tong Xiao</a>, <a href="/search/eess?searchtype=author&query=Zhu%2C+J">Jingbo Zhu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.08057v1-abstract-short" style="display: inline;"> Recent advancements have highlighted the efficacy of self-supervised learning (SSL) features in various speech-related tasks, providing lightweight and versatile multi-view speech representations. However, our study reveals that while SSL features expedite model convergence, they conflict with traditional spectral features like FBanks in terms of update directions. In response, we propose a novel… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.08057v1-abstract-full').style.display = 'inline'; document.getElementById('2501.08057v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.08057v1-abstract-full" style="display: none;"> Recent advancements have highlighted the efficacy of self-supervised learning (SSL) features in various speech-related tasks, providing lightweight and versatile multi-view speech representations. However, our study reveals that while SSL features expedite model convergence, they conflict with traditional spectral features like FBanks in terms of update directions. In response, we propose a novel generalized feature fusion framework grounded in conditional computation, featuring a gradient-sensitive gating network and a multi-stage dropout strategy. This framework mitigates feature conflicts and bolsters model robustness to multi-view input features. By integrating SSL and spectral features, our approach accelerates convergence and maintains performance on par with spectral models across multiple speech translation tasks on the MUSTC dataset. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.08057v1-abstract-full').style.display = 'none'; document.getElementById('2501.08057v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ICASSP 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.07191">arXiv:2501.07191</a> <span> [<a href="https://arxiv.org/pdf/2501.07191">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Pre-Trained Large Language Model Based Remaining Useful Life Transfer Prediction of Bearing </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Tao%2C+L">Laifa Tao</a>, <a href="/search/eess?searchtype=author&query=Zhao%2C+Z">Zhengduo Zhao</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+X">Xuesong Wang</a>, <a href="/search/eess?searchtype=author&query=Li%2C+B">Bin Li</a>, <a href="/search/eess?searchtype=author&query=Zhan%2C+W">Wenchao Zhan</a>, <a href="/search/eess?searchtype=author&query=Su%2C+X">Xuanyuan Su</a>, <a href="/search/eess?searchtype=author&query=Li%2C+S">Shangyu Li</a>, <a href="/search/eess?searchtype=author&query=Huang%2C+Q">Qixuan Huang</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+H">Haifei Liu</a>, <a href="/search/eess?searchtype=author&query=Lu%2C+C">Chen Lu</a>, <a href="/search/eess?searchtype=author&query=Lian%2C+Z">Zhixuan Lian</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.07191v1-abstract-short" style="display: inline;"> Accurately predicting the remaining useful life (RUL) of rotating machinery, such as bearings, is essential for ensuring equipment reliability and minimizing unexpected industrial failures. Traditional data-driven deep learning methods face challenges in practical settings due to inconsistent training and testing data distributions and limited generalization for long-term predictions. </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.07191v1-abstract-full" style="display: none;"> Accurately predicting the remaining useful life (RUL) of rotating machinery, such as bearings, is essential for ensuring equipment reliability and minimizing unexpected industrial failures. Traditional data-driven deep learning methods face challenges in practical settings due to inconsistent training and testing data distributions and limited generalization for long-term predictions. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.07191v1-abstract-full').style.display = 'none'; document.getElementById('2501.07191v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.20378">arXiv:2412.20378</a> <span> [<a href="https://arxiv.org/pdf/2412.20378">pdf</a>, <a href="https://arxiv.org/format/2412.20378">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Tri-Ergon: Fine-grained Video-to-Audio Generation with Multi-modal Conditions and LUFS Control </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Li%2C+B">Bingliang Li</a>, <a href="/search/eess?searchtype=author&query=Yang%2C+F">Fengyu Yang</a>, <a href="/search/eess?searchtype=author&query=Mao%2C+Y">Yuxin Mao</a>, <a href="/search/eess?searchtype=author&query=Ye%2C+Q">Qingwen Ye</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+H">Hongkai Chen</a>, <a href="/search/eess?searchtype=author&query=Zhong%2C+Y">Yiran Zhong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.20378v1-abstract-short" style="display: inline;"> Video-to-audio (V2A) generation utilizes visual-only video features to produce realistic sounds that correspond to the scene. However, current V2A models often lack fine-grained control over the generated audio, especially in terms of loudness variation and the incorporation of multi-modal conditions. To overcome these limitations, we introduce Tri-Ergon, a diffusion-based V2A model that incorpora… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.20378v1-abstract-full').style.display = 'inline'; document.getElementById('2412.20378v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.20378v1-abstract-full" style="display: none;"> Video-to-audio (V2A) generation utilizes visual-only video features to produce realistic sounds that correspond to the scene. However, current V2A models often lack fine-grained control over the generated audio, especially in terms of loudness variation and the incorporation of multi-modal conditions. To overcome these limitations, we introduce Tri-Ergon, a diffusion-based V2A model that incorporates textual, auditory, and pixel-level visual prompts to enable detailed and semantically rich audio synthesis. Additionally, we introduce Loudness Units relative to Full Scale (LUFS) embedding, which allows for precise manual control of the loudness changes over time for individual audio channels, enabling our model to effectively address the intricate correlation of video and audio in real-world Foley workflows. Tri-Ergon is capable of creating 44.1 kHz high-fidelity stereo audio clips of varying lengths up to 60 seconds, which significantly outperforms existing state-of-the-art V2A methods that typically generate mono audio for a fixed duration. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.20378v1-abstract-full').style.display = 'none'; document.getElementById('2412.20378v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">AAAI 2025 Accepted</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.13676">arXiv:2412.13676</a> <span> [<a href="https://arxiv.org/pdf/2412.13676">pdf</a>, <a href="https://arxiv.org/ps/2412.13676">ps</a>, <a href="https://arxiv.org/format/2412.13676">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Emerging Technologies">cs.ET</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Robust UAV Jittering and Task Scheduling in Mobile Edge Computing with Data Compression </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Li%2C+B">Bin Li</a>, <a href="/search/eess?searchtype=author&query=Zhu%2C+X">Xiao Zhu</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+J">Junyi Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.13676v1-abstract-short" style="display: inline;"> Data compression technology is able to reduce data size, which can be applied to lower the cost of task offloading in mobile edge computing (MEC). This paper addresses the practical challenges for robust trajectory and scheduling optimization based on data compression in the unmanned aerial vehicle (UAV)-assisted MEC, aiming to minimize the sum energy cost of terminal users while maintaining robus… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.13676v1-abstract-full').style.display = 'inline'; document.getElementById('2412.13676v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.13676v1-abstract-full" style="display: none;"> Data compression technology is able to reduce data size, which can be applied to lower the cost of task offloading in mobile edge computing (MEC). This paper addresses the practical challenges for robust trajectory and scheduling optimization based on data compression in the unmanned aerial vehicle (UAV)-assisted MEC, aiming to minimize the sum energy cost of terminal users while maintaining robust performance during UAV flight. Considering the non-convexity of the problem and the dynamic nature of the scenario, the optimization problem is reformulated as a Markov decision process. Then, a randomized ensembled double Q-learning (REDQ) algorithm is adopted to solve the issue. The algorithm allows for higher feasible update-to-data ratio, enabling more effective learning from observed data. The simulation results show that the proposed scheme effectively reduces the energy consumption while ensuring flight robustness. Compared to the PPO and A2C algorithms, energy consumption is reduced by approximately $21.9\%$ and $35.4\%$, respectively. This method demonstrates significant advantages in complex environments and holds great potential for practical applications. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.13676v1-abstract-full').style.display = 'none'; document.getElementById('2412.13676v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">10 pages, 8 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.08608">arXiv:2412.08608</a> <span> [<a href="https://arxiv.org/pdf/2412.08608">pdf</a>, <a href="https://arxiv.org/format/2412.08608">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> AdvWave: Stealthy Adversarial Jailbreak Attack against Large Audio-Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Kang%2C+M">Mintong Kang</a>, <a href="/search/eess?searchtype=author&query=Xu%2C+C">Chejian Xu</a>, <a href="/search/eess?searchtype=author&query=Li%2C+B">Bo Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.08608v1-abstract-short" style="display: inline;"> Recent advancements in large audio-language models (LALMs) have enabled speech-based user interactions, significantly enhancing user experience and accelerating the deployment of LALMs in real-world applications. However, ensuring the safety of LALMs is crucial to prevent risky outputs that may raise societal concerns or violate AI regulations. Despite the importance of this issue, research on jai… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.08608v1-abstract-full').style.display = 'inline'; document.getElementById('2412.08608v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.08608v1-abstract-full" style="display: none;"> Recent advancements in large audio-language models (LALMs) have enabled speech-based user interactions, significantly enhancing user experience and accelerating the deployment of LALMs in real-world applications. However, ensuring the safety of LALMs is crucial to prevent risky outputs that may raise societal concerns or violate AI regulations. Despite the importance of this issue, research on jailbreaking LALMs remains limited due to their recent emergence and the additional technical challenges they present compared to attacks on DNN-based audio models. Specifically, the audio encoders in LALMs, which involve discretization operations, often lead to gradient shattering, hindering the effectiveness of attacks relying on gradient-based optimizations. The behavioral variability of LALMs further complicates the identification of effective (adversarial) optimization targets. Moreover, enforcing stealthiness constraints on adversarial audio waveforms introduces a reduced, non-convex feasible solution space, further intensifying the challenges of the optimization process. To overcome these challenges, we develop AdvWave, the first jailbreak framework against LALMs. We propose a dual-phase optimization method that addresses gradient shattering, enabling effective end-to-end gradient-based optimization. Additionally, we develop an adaptive adversarial target search algorithm that dynamically adjusts the adversarial optimization target based on the response patterns of LALMs for specific queries. To ensure that adversarial audio remains perceptually natural to human listeners, we design a classifier-guided optimization approach that generates adversarial noise resembling common urban sounds. Extensive evaluations on multiple advanced LALMs demonstrate that AdvWave outperforms baseline methods, achieving a 40% higher average jailbreak attack success rate. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.08608v1-abstract-full').style.display = 'none'; document.getElementById('2412.08608v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.03965">arXiv:2412.03965</a> <span> [<a href="https://arxiv.org/pdf/2412.03965">pdf</a>, <a href="https://arxiv.org/format/2412.03965">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Emerging Technologies">cs.ET</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Offloading Revenue Maximization in Multi-UAV-Assisted Mobile Edge Computing for Video Stream </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Li%2C+B">Bin Li</a>, <a href="/search/eess?searchtype=author&query=Shan%2C+H">Huimin Shan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.03965v1-abstract-short" style="display: inline;"> Traditional video transmission systems assisted by multiple Unmanned Aerial Vehicles (UAVs) are often limited by computing resources, making it challenging to meet the demands for efficient video processing. To solve this challenge, this paper presents a multi-UAV-assisted Device-to-Device (D2D) mobile edge computing system for the maximization of task offloading profits in video stream transmissi… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.03965v1-abstract-full').style.display = 'inline'; document.getElementById('2412.03965v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.03965v1-abstract-full" style="display: none;"> Traditional video transmission systems assisted by multiple Unmanned Aerial Vehicles (UAVs) are often limited by computing resources, making it challenging to meet the demands for efficient video processing. To solve this challenge, this paper presents a multi-UAV-assisted Device-to-Device (D2D) mobile edge computing system for the maximization of task offloading profits in video stream transmission. In particular, the system enables UAVs to collaborate with idle user devices to process video computing tasks by introducing D2D communications. To maximize the system efficiency, the paper jointly optimizes power allocation, video transcoding strategies, computing resource allocation, and UAV trajectory. The resulting non-convex optimization problem is formulated as a Markov decision process and solved relying on the Twin Delayed Deep Deterministic policy gradient (TD3) algorithm. Numerical results indicate that the proposed TD3 algorithm performs a significant advantage over other traditional algorithms in enhancing the overall system efficiency. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.03965v1-abstract-full').style.display = 'none'; document.getElementById('2412.03965v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">10 pages, 8 figures</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> IEEE Internet of Things Journal, 2024 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.02611">arXiv:2412.02611</a> <span> [<a href="https://arxiv.org/pdf/2412.02611">pdf</a>, <a href="https://arxiv.org/format/2412.02611">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> AV-Odyssey Bench: Can Your Multimodal LLMs Really Understand Audio-Visual Information? </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Gong%2C+K">Kaixiong Gong</a>, <a href="/search/eess?searchtype=author&query=Feng%2C+K">Kaituo Feng</a>, <a href="/search/eess?searchtype=author&query=Li%2C+B">Bohao Li</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+Y">Yibing Wang</a>, <a href="/search/eess?searchtype=author&query=Cheng%2C+M">Mofan Cheng</a>, <a href="/search/eess?searchtype=author&query=Yang%2C+S">Shijia Yang</a>, <a href="/search/eess?searchtype=author&query=Han%2C+J">Jiaming Han</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+B">Benyou Wang</a>, <a href="/search/eess?searchtype=author&query=Bai%2C+Y">Yutong Bai</a>, <a href="/search/eess?searchtype=author&query=Yang%2C+Z">Zhuoran Yang</a>, <a href="/search/eess?searchtype=author&query=Yue%2C+X">Xiangyu Yue</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.02611v1-abstract-short" style="display: inline;"> Recently, multimodal large language models (MLLMs), such as GPT-4o, Gemini 1.5 Pro, and Reka Core, have expanded their capabilities to include vision and audio modalities. While these models demonstrate impressive performance across a wide range of audio-visual applications, our proposed DeafTest reveals that MLLMs often struggle with simple tasks humans find trivial: 1) determining which of two s… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.02611v1-abstract-full').style.display = 'inline'; document.getElementById('2412.02611v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.02611v1-abstract-full" style="display: none;"> Recently, multimodal large language models (MLLMs), such as GPT-4o, Gemini 1.5 Pro, and Reka Core, have expanded their capabilities to include vision and audio modalities. While these models demonstrate impressive performance across a wide range of audio-visual applications, our proposed DeafTest reveals that MLLMs often struggle with simple tasks humans find trivial: 1) determining which of two sounds is louder, and 2) determining which of two sounds has a higher pitch. Motivated by these observations, we introduce AV-Odyssey Bench, a comprehensive audio-visual benchmark designed to assess whether those MLLMs can truly understand the audio-visual information. This benchmark encompasses 4,555 carefully crafted problems, each incorporating text, visual, and audio components. To successfully infer answers, models must effectively leverage clues from both visual and audio inputs. To ensure precise and objective evaluation of MLLM responses, we have structured the questions as multiple-choice, eliminating the need for human evaluation or LLM-assisted assessment. We benchmark a series of closed-source and open-source models and summarize the observations. By revealing the limitations of current models, we aim to provide useful insight for future dataset collection and model development. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.02611v1-abstract-full').style.display = 'none'; document.getElementById('2412.02611v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project page: https://av-odyssey.github.io/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.01493">arXiv:2412.01493</a> <span> [<a href="https://arxiv.org/pdf/2412.01493">pdf</a>, <a href="https://arxiv.org/format/2412.01493">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> Learning Adaptive Lighting via Channel-Aware Guidance </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Yang%2C+Q">Qirui Yang</a>, <a href="/search/eess?searchtype=author&query=Jiang%2C+P">Peng-Tao Jiang</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+H">Hao Zhang</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+J">Jinwei Chen</a>, <a href="/search/eess?searchtype=author&query=Li%2C+B">Bo Li</a>, <a href="/search/eess?searchtype=author&query=Yue%2C+H">Huanjing Yue</a>, <a href="/search/eess?searchtype=author&query=Yang%2C+J">Jingyu Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.01493v1-abstract-short" style="display: inline;"> Learning lighting adaption is a key step in obtaining a good visual perception and supporting downstream vision tasks. There are multiple light-related tasks (e.g., image retouching and exposure correction) and previous studies have mainly investigated these tasks individually. However, we observe that the light-related tasks share fundamental properties: i) different color channels have different… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.01493v1-abstract-full').style.display = 'inline'; document.getElementById('2412.01493v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.01493v1-abstract-full" style="display: none;"> Learning lighting adaption is a key step in obtaining a good visual perception and supporting downstream vision tasks. There are multiple light-related tasks (e.g., image retouching and exposure correction) and previous studies have mainly investigated these tasks individually. However, we observe that the light-related tasks share fundamental properties: i) different color channels have different light properties, and ii) the channel differences reflected in the time and frequency domains are different. Based on the common light property guidance, we propose a Learning Adaptive Lighting Network (LALNet), a unified framework capable of processing different light-related tasks. Specifically, we introduce the color-separated features that emphasize the light difference of different color channels and combine them with the traditional color-mixed features by Light Guided Attention (LGA). The LGA utilizes color-separated features to guide color-mixed features focusing on channel differences and ensuring visual consistency across channels. We introduce dual domain channel modulation to generate color-separated features and a wavelet followed by a vision state space module to generate color-mixed features. Extensive experiments on four representative light-related tasks demonstrate that LALNet significantly outperforms state-of-the-art methods on benchmark tests and requires fewer computational resources. We provide an anonymous online demo at https://xxxxxx2025.github.io/LALNet/. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.01493v1-abstract-full').style.display = 'none'; document.getElementById('2412.01493v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.00877">arXiv:2412.00877</a> <span> [<a href="https://arxiv.org/pdf/2412.00877">pdf</a>, <a href="https://arxiv.org/format/2412.00877">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Complexity boosted adaptive training for better low resource ASR performance </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Lu%2C+H">Hongxuan Lu</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+S">Shenjian Wang</a>, <a href="/search/eess?searchtype=author&query=Li%2C+B">Biao Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.00877v1-abstract-short" style="display: inline;"> During the entire training process of the ASR model, the intensity of data augmentation and the approach of calculating training loss are applied in a regulated manner based on preset parameters. For example, SpecAugment employs a predefined strength of augmentation to mask parts of the time-frequency domain spectrum. Similarly, in CTC-based multi-layer models, the loss is generally determined bas… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.00877v1-abstract-full').style.display = 'inline'; document.getElementById('2412.00877v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.00877v1-abstract-full" style="display: none;"> During the entire training process of the ASR model, the intensity of data augmentation and the approach of calculating training loss are applied in a regulated manner based on preset parameters. For example, SpecAugment employs a predefined strength of augmentation to mask parts of the time-frequency domain spectrum. Similarly, in CTC-based multi-layer models, the loss is generally determined based on the output of the encoder's final layer during the training process. However, ignoring dynamic characteristics may suboptimally train models. To address the issue, we present a two-stage training method, known as complexity-boosted adaptive (CBA) training. It involves making dynamic adjustments to data augmentation strategies and CTC loss propagation based on the complexity of the training samples. In the first stage, we train the model with intermediate-CTC-based regularization and data augmentation without any adaptive policy. In the second stage, we propose a novel adaptive policy, called MinMax-IBF, which calculates the complexity of samples. We combine the MinMax-IBF policy to data augmentation and intermediate CTC loss regularization to continue training. The proposed CBA training approach shows considerable improvements, up to 13.4% and 14.1% relative reduction in WER on the LibriSpeech 100h test-clean and test-other dataset and also up to 6.3% relative reduction on AISHELL-1 test set, over the Conformer architecture in Wenet. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.00877v1-abstract-full').style.display = 'none'; document.getElementById('2412.00877v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.00415">arXiv:2412.00415</a> <span> [<a href="https://arxiv.org/pdf/2412.00415">pdf</a>, <a href="https://arxiv.org/format/2412.00415">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Sample adaptive data augmentation with progressive scheduling </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Lu%2C+H">Hongxuan Lu</a>, <a href="/search/eess?searchtype=author&query=Li%2C+B">Biao Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.00415v1-abstract-short" style="display: inline;"> Data augmentation is a widely adopted technique utilized to improve the robustness of automatic speech recognition (ASR). Employing a fixed data augmentation strategy for all training data is a common practice. However, it is important to note that there can be variations in factors such as background noise, speech rate, etc. among different samples within a single training batch. By using a fixed… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.00415v1-abstract-full').style.display = 'inline'; document.getElementById('2412.00415v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.00415v1-abstract-full" style="display: none;"> Data augmentation is a widely adopted technique utilized to improve the robustness of automatic speech recognition (ASR). Employing a fixed data augmentation strategy for all training data is a common practice. However, it is important to note that there can be variations in factors such as background noise, speech rate, etc. among different samples within a single training batch. By using a fixed augmentation strategy, there is a risk that the model may reach a suboptimal state. In addition to the risks of employing a fixed augmentation strategy, the model's capabilities may differ across various training stages. To address these issues, this paper proposes the method of sample-adaptive data augmentation with progressive scheduling(PS-SapAug). The proposed method applies dynamic data augmentation in a two-stage training approach. It employs hybrid normalization to compute sample-specific augmentation parameters based on each sample's loss. Additionally, the probability of augmentation gradually increases throughout the training progression. Our method is evaluated on popular ASR benchmark datasets, including Aishell-1 and Librispeech-100h, achieving up to 8.13% WER reduction on LibriSpeech-100h test-clean, 6.23% on test-other, and 5.26% on AISHELL-1 test set, which demonstrate the efficacy of our approach enhancing performance and minimizing errors. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.00415v1-abstract-full').style.display = 'none'; document.getElementById('2412.00415v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.05205">arXiv:2411.05205</a> <span> [<a href="https://arxiv.org/pdf/2411.05205">pdf</a>, <a href="https://arxiv.org/format/2411.05205">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Networking and Internet Architecture">cs.NI</span> </div> </div> <p class="title is-5 mathjax"> Maximizing User Connectivity in AI-Enabled Multi-UAV Networks: A Distributed Strategy Generalized to Arbitrary User Distributions </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Li%2C+B">Bowei Li</a>, <a href="/search/eess?searchtype=author&query=Xu%2C+Y">Yang Xu</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+R">Ran Zhang</a>, <a href="/search/eess?searchtype=author&query=Jiang"> Jiang</a>, <a href="/search/eess?searchtype=author&query=Xie"> Xie</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+M">Miao Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.05205v1-abstract-short" style="display: inline;"> Deep reinforcement learning (DRL) has been extensively applied to Multi-Unmanned Aerial Vehicle (UAV) network (MUN) to effectively enable real-time adaptation to complex, time-varying environments. Nevertheless, most of the existing works assume a stationary user distribution (UD) or a dynamic one with predicted patterns. Such considerations may make the UD-specific strategies insufficient when a… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.05205v1-abstract-full').style.display = 'inline'; document.getElementById('2411.05205v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.05205v1-abstract-full" style="display: none;"> Deep reinforcement learning (DRL) has been extensively applied to Multi-Unmanned Aerial Vehicle (UAV) network (MUN) to effectively enable real-time adaptation to complex, time-varying environments. Nevertheless, most of the existing works assume a stationary user distribution (UD) or a dynamic one with predicted patterns. Such considerations may make the UD-specific strategies insufficient when a MUN is deployed in unknown environments. To this end, this paper investigates distributed user connectivity maximization problem in a MUN with generalization to arbitrary UDs. Specifically, the problem is first formulated into a time-coupled combinatorial nonlinear non-convex optimization with arbitrary underlying UDs. To make the optimization tractable, a multi-agent CNN-enhanced deep Q learning (MA-CDQL) algorithm is proposed. The algorithm integrates a ResNet-based CNN to the policy network to analyze the input UD in real time and obtain optimal decisions based on the extracted high-level UD features. To improve the learning efficiency and avoid local optimums, a heatmap algorithm is developed to transform the raw UD to a continuous density map. The map will be part of the true input to the policy network. Simulations are conducted to demonstrate the efficacy of UD heatmaps and the proposed algorithm in maximizing user connectivity as compared to K-means methods. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.05205v1-abstract-full').style.display = 'none'; document.getElementById('2411.05205v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.02534">arXiv:2411.02534</a> <span> [<a href="https://arxiv.org/pdf/2411.02534">pdf</a>, <a href="https://arxiv.org/format/2411.02534">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Multi-modal Spatial Clustering for Spatial Transcriptomics Utilizing High-resolution Histology Images </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Li%2C+B">Bingjun Li</a>, <a href="/search/eess?searchtype=author&query=Karami%2C+M">Mostafa Karami</a>, <a href="/search/eess?searchtype=author&query=Junayed%2C+M+S">Masum Shah Junayed</a>, <a href="/search/eess?searchtype=author&query=Nabavi%2C+S">Sheida Nabavi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.02534v1-abstract-short" style="display: inline;"> Understanding the intricate cellular environment within biological tissues is crucial for uncovering insights into complex biological functions. While single-cell RNA sequencing has significantly enhanced our understanding of cellular states, it lacks the spatial context necessary to fully comprehend the cellular environment. Spatial transcriptomics (ST) addresses this limitation by enabling trans… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02534v1-abstract-full').style.display = 'inline'; document.getElementById('2411.02534v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.02534v1-abstract-full" style="display: none;"> Understanding the intricate cellular environment within biological tissues is crucial for uncovering insights into complex biological functions. While single-cell RNA sequencing has significantly enhanced our understanding of cellular states, it lacks the spatial context necessary to fully comprehend the cellular environment. Spatial transcriptomics (ST) addresses this limitation by enabling transcriptome-wide gene expression profiling while preserving spatial context. One of the principal challenges in ST data analysis is spatial clustering, which reveals spatial domains based on the spots within a tissue. Modern ST sequencing procedures typically include a high-resolution histology image, which has been shown in previous studies to be closely connected to gene expression profiles. However, current spatial clustering methods often fail to fully integrate high-resolution histology image features with gene expression data, limiting their ability to capture critical spatial and cellular interactions. In this study, we propose the spatial transcriptomics multi-modal clustering (stMMC) model, a novel contrastive learning-based deep learning approach that integrates gene expression data with histology image features through a multi-modal parallel graph autoencoder. We tested stMMC against four state-of-the-art baseline models: Leiden, GraphST, SpaGCN, and stLearn on two public ST datasets with 13 sample slices in total. The experiments demonstrated that stMMC outperforms all the baseline models in terms of ARI and NMI. An ablation study further validated the contributions of contrastive learning and the incorporation of histology image features. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02534v1-abstract-full').style.display = 'none'; document.getElementById('2411.02534v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">9 pages</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> J.3; I.2.1 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.02038">arXiv:2411.02038</a> <span> [<a href="https://arxiv.org/pdf/2411.02038">pdf</a>, <a href="https://arxiv.org/format/2411.02038">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Addressing Representation Collapse in Vector Quantized Models with One Linear Layer </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Zhu%2C+Y">Yongxin Zhu</a>, <a href="/search/eess?searchtype=author&query=Li%2C+B">Bocheng Li</a>, <a href="/search/eess?searchtype=author&query=Xin%2C+Y">Yifei Xin</a>, <a href="/search/eess?searchtype=author&query=Xu%2C+L">Linli Xu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.02038v1-abstract-short" style="display: inline;"> Vector Quantization (VQ) is a widely used method for converting continuous representations into discrete codes, which has become fundamental in unsupervised representation learning and latent generative models. However, VQ models are often hindered by the problem of representation collapse in the latent space, which leads to low codebook utilization and limits the scalability of the codebook for l… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02038v1-abstract-full').style.display = 'inline'; document.getElementById('2411.02038v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.02038v1-abstract-full" style="display: none;"> Vector Quantization (VQ) is a widely used method for converting continuous representations into discrete codes, which has become fundamental in unsupervised representation learning and latent generative models. However, VQ models are often hindered by the problem of representation collapse in the latent space, which leads to low codebook utilization and limits the scalability of the codebook for large-scale training. Existing methods designed to mitigate representation collapse typically reduce the dimensionality of latent space at the expense of model capacity, which do not fully resolve the core issue. In this study, we conduct a theoretical analysis of representation collapse in VQ models and identify its primary cause as the disjoint optimization of the codebook, where only a small subset of code vectors are updated through gradient descent. To address this issue, we propose \textbf{SimVQ}, a novel method which reparameterizes the code vectors through a linear transformation layer based on a learnable latent basis. This transformation optimizes the \textit{entire linear space} spanned by the codebook, rather than merely updating \textit{the code vector} selected by the nearest-neighbor search in vanilla VQ models. Although it is commonly understood that the multiplication of two linear matrices is equivalent to applying a single linear layer, our approach works surprisingly well in resolving the collapse issue in VQ models with just one linear layer. We validate the efficacy of SimVQ through extensive experiments across various modalities, including image and audio data with different model architectures. Our code is available at \url{https://github.com/youngsheen/SimVQ}. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02038v1-abstract-full').style.display = 'none'; document.getElementById('2411.02038v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.22732">arXiv:2410.22732</a> <span> [<a href="https://arxiv.org/pdf/2410.22732">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> st-DTPM: Spatial-Temporal Guided Diffusion Transformer Probabilistic Model for Delayed Scan PET Image Prediction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Hong%2C+R">Ran Hong</a>, <a href="/search/eess?searchtype=author&query=Huang%2C+Y">Yuxia Huang</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+L">Lei Liu</a>, <a href="/search/eess?searchtype=author&query=Wu%2C+Z">Zhonghui Wu</a>, <a href="/search/eess?searchtype=author&query=Li%2C+B">Bingxuan Li</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+X">Xuemei Wang</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+Q">Qiegen Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.22732v1-abstract-short" style="display: inline;"> PET imaging is widely employed for observing biological metabolic activities within the human body. However, numerous benign conditions can cause increased uptake of radiopharmaceuticals, confounding differentiation from malignant tumors. Several studies have indicated that dual-time PET imaging holds promise in distinguishing between malignant and benign tumor processes. Nevertheless, the hour-lo… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.22732v1-abstract-full').style.display = 'inline'; document.getElementById('2410.22732v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.22732v1-abstract-full" style="display: none;"> PET imaging is widely employed for observing biological metabolic activities within the human body. However, numerous benign conditions can cause increased uptake of radiopharmaceuticals, confounding differentiation from malignant tumors. Several studies have indicated that dual-time PET imaging holds promise in distinguishing between malignant and benign tumor processes. Nevertheless, the hour-long distribution period of radiopharmaceuticals post-injection complicates the determination of optimal timing for the second scan, presenting challenges in both practical applications and research. Notably, we have identified that delay time PET imaging can be framed as an image-to-image conversion problem. Motivated by this insight, we propose a novel spatial-temporal guided diffusion transformer probabilistic model (st-DTPM) to solve dual-time PET imaging prediction problem. Specifically, this architecture leverages the U-net framework that integrates patch-wise features of CNN and pixel-wise relevance of Transformer to obtain local and global information. And then employs a conditional DDPM model for image synthesis. Furthermore, on spatial condition, we concatenate early scan PET images and noisy PET images on every denoising step to guide the spatial distribution of denoising sampling. On temporal condition, we convert diffusion time steps and delay time to a universal time vector, then embed it to each layer of model architecture to further improve the accuracy of predictions. Experimental results demonstrated the superiority of our method over alternative approaches in preserving image quality and structural information, thereby affirming its efficacy in predictive task. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.22732v1-abstract-full').style.display = 'none'; document.getElementById('2410.22732v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.21951">arXiv:2410.21951</a> <span> [<a href="https://arxiv.org/pdf/2410.21951">pdf</a>, <a href="https://arxiv.org/format/2410.21951">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> Fast and High-Quality Auto-Regressive Speech Synthesis via Speculative Decoding </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Li%2C+B">Bohan Li</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+H">Hankun Wang</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+S">Situo Zhang</a>, <a href="/search/eess?searchtype=author&query=Guo%2C+Y">Yiwei Guo</a>, <a href="/search/eess?searchtype=author&query=Yu%2C+K">Kai Yu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.21951v2-abstract-short" style="display: inline;"> The auto-regressive architecture, like GPTs, is widely used in modern Text-to-Speech (TTS) systems. However, it incurs substantial inference time, particularly due to the challenges in the next-token prediction posed by lengthy sequences of speech tokens. In this work, we introduce VADUSA, one of the first approaches to accelerate auto-regressive TTS through speculative decoding. Our results show… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.21951v2-abstract-full').style.display = 'inline'; document.getElementById('2410.21951v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.21951v2-abstract-full" style="display: none;"> The auto-regressive architecture, like GPTs, is widely used in modern Text-to-Speech (TTS) systems. However, it incurs substantial inference time, particularly due to the challenges in the next-token prediction posed by lengthy sequences of speech tokens. In this work, we introduce VADUSA, one of the first approaches to accelerate auto-regressive TTS through speculative decoding. Our results show that VADUSA not only significantly improves inference speed but also enhances performance by incorporating draft heads to predict future speech content auto-regressively. Furthermore, the inclusion of a tolerance mechanism during sampling accelerates inference without compromising quality. Our approach demonstrates strong generalization across large datasets and various types of speech tokens. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.21951v2-abstract-full').style.display = 'none'; document.getElementById('2410.21951v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 29 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by ICASSP 2025</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">MSC Class:</span> 68T07 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.21160">arXiv:2410.21160</a> <span> [<a href="https://arxiv.org/pdf/2410.21160">pdf</a>, <a href="https://arxiv.org/format/2410.21160">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> KaLDeX: Kalman Filter based Linear Deformable Cross Attention for Retina Vessel Segmentation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Zhao%2C+Z">Zhihao Zhao</a>, <a href="/search/eess?searchtype=author&query=Faghihroohi%2C+S">Shahrooz Faghihroohi</a>, <a href="/search/eess?searchtype=author&query=Zhao%2C+Y">Yinzheng Zhao</a>, <a href="/search/eess?searchtype=author&query=Yang%2C+J">Junjie Yang</a>, <a href="/search/eess?searchtype=author&query=Zhong%2C+S">Shipeng Zhong</a>, <a href="/search/eess?searchtype=author&query=Huang%2C+K">Kai Huang</a>, <a href="/search/eess?searchtype=author&query=Navab%2C+N">Nassir Navab</a>, <a href="/search/eess?searchtype=author&query=Li%2C+B">Boyang Li</a>, <a href="/search/eess?searchtype=author&query=Nasseri%2C+M+A">M. Ali Nasseri</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.21160v1-abstract-short" style="display: inline;"> Background and Objective: In the realm of ophthalmic imaging, accurate vascular segmentation is paramount for diagnosing and managing various eye diseases. Contemporary deep learning-based vascular segmentation models rival human accuracy but still face substantial challenges in accurately segmenting minuscule blood vessels in neural network applications. Due to the necessity of multiple downsampl… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.21160v1-abstract-full').style.display = 'inline'; document.getElementById('2410.21160v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.21160v1-abstract-full" style="display: none;"> Background and Objective: In the realm of ophthalmic imaging, accurate vascular segmentation is paramount for diagnosing and managing various eye diseases. Contemporary deep learning-based vascular segmentation models rival human accuracy but still face substantial challenges in accurately segmenting minuscule blood vessels in neural network applications. Due to the necessity of multiple downsampling operations in the CNN models, fine details from high-resolution images are inevitably lost. The objective of this study is to design a structure to capture the delicate and small blood vessels. Methods: To address these issues, we propose a novel network (KaLDeX) for vascular segmentation leveraging a Kalman filter based linear deformable cross attention (LDCA) module, integrated within a UNet++ framework. Our approach is based on two key components: Kalman filter (KF) based linear deformable convolution (LD) and cross-attention (CA) modules. The LD module is designed to adaptively adjust the focus on thin vessels that might be overlooked in standard convolution. The CA module improves the global understanding of vascular structures by aggregating the detailed features from the LD module with the high level features from the UNet++ architecture. Finally, we adopt a topological loss function based on persistent homology to constrain the topological continuity of the segmentation. Results: The proposed method is evaluated on retinal fundus image datasets (DRIVE, CHASE_BD1, and STARE) as well as the 3mm and 6mm of the OCTA-500 dataset, achieving an average accuracy (ACC) of 97.25%, 97.77%, 97.85%, 98.89%, and 98.21%, respectively. Conclusions: Empirical evidence shows that our method outperforms the current best models on different vessel segmentation datasets. Our source code is available at: https://github.com/AIEyeSystem/KalDeX. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.21160v1-abstract-full').style.display = 'none'; document.getElementById('2410.21160v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.18965">arXiv:2410.18965</a> <span> [<a href="https://arxiv.org/pdf/2410.18965">pdf</a>, <a href="https://arxiv.org/format/2410.18965">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Optimization and Control">math.OC</span> </div> </div> <p class="title is-5 mathjax"> On the Crucial Role of Initialization for Matrix Factorization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Li%2C+B">Bingcong Li</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+L">Liang Zhang</a>, <a href="/search/eess?searchtype=author&query=Mokhtari%2C+A">Aryan Mokhtari</a>, <a href="/search/eess?searchtype=author&query=He%2C+N">Niao He</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.18965v3-abstract-short" style="display: inline;"> This work revisits the classical low-rank matrix factorization problem and unveils the critical role of initialization in shaping convergence rates for such nonconvex and nonsmooth optimization. We introduce Nystrom initialization, which significantly improves the global convergence of Scaled Gradient Descent (ScaledGD) in both symmetric and asymmetric matrix factorization tasks. Specifically, we… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.18965v3-abstract-full').style.display = 'inline'; document.getElementById('2410.18965v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.18965v3-abstract-full" style="display: none;"> This work revisits the classical low-rank matrix factorization problem and unveils the critical role of initialization in shaping convergence rates for such nonconvex and nonsmooth optimization. We introduce Nystrom initialization, which significantly improves the global convergence of Scaled Gradient Descent (ScaledGD) in both symmetric and asymmetric matrix factorization tasks. Specifically, we prove that ScaledGD with Nystrom initialization achieves quadratic convergence in cases where only linear rates were previously known. Furthermore, we extend this initialization to low-rank adapters (LoRA) commonly used for finetuning foundation models. Our approach, NoRA, i.e., LoRA with Nystrom initialization, demonstrates superior performance across various downstream tasks and model scales, from 1B to 7B parameters, in large language and diffusion models. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.18965v3-abstract-full').style.display = 'none'; document.getElementById('2410.18965v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 24 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.18582">arXiv:2410.18582</a> <span> [<a href="https://arxiv.org/pdf/2410.18582">pdf</a>, <a href="https://arxiv.org/format/2410.18582">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> LLM-Aided Efficient Hardware Design Automation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Xu%2C+K">Kangwei Xu</a>, <a href="/search/eess?searchtype=author&query=Qiu%2C+R">Ruidi Qiu</a>, <a href="/search/eess?searchtype=author&query=Zhao%2C+Z">Zhuorui Zhao</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+G+L">Grace Li Zhang</a>, <a href="/search/eess?searchtype=author&query=Schlichtmann%2C+U">Ulf Schlichtmann</a>, <a href="/search/eess?searchtype=author&query=Li%2C+B">Bing Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.18582v1-abstract-short" style="display: inline;"> With the rapidly increasing complexity of modern chips, hardware engineers are required to invest more effort in tasks such as circuit design, verification, and physical implementation. These workflows often involve continuous modifications, which are labor-intensive and prone to errors. Therefore, there is an increasing need for more efficient and cost-effective Electronic Design Automation (EDA)… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.18582v1-abstract-full').style.display = 'inline'; document.getElementById('2410.18582v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.18582v1-abstract-full" style="display: none;"> With the rapidly increasing complexity of modern chips, hardware engineers are required to invest more effort in tasks such as circuit design, verification, and physical implementation. These workflows often involve continuous modifications, which are labor-intensive and prone to errors. Therefore, there is an increasing need for more efficient and cost-effective Electronic Design Automation (EDA) solutions to accelerate new hardware development. Recently, large language models (LLMs) have made significant advancements in contextual understanding, logical reasoning, and response generation. Since hardware designs and intermediate scripts can be expressed in text format, it is reasonable to explore whether integrating LLMs into EDA could simplify and fully automate the entire workflow. Accordingly, this paper discusses such possibilities in several aspects, covering hardware description language (HDL) generation, code debugging, design verification, and physical implementation. Two case studies, along with their future outlook, are introduced to highlight the capabilities of LLMs in code repair and testbench generation. Finally, future directions and challenges are highlighted to further explore the potential of LLMs in shaping the next-generation EDA <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.18582v1-abstract-full').style.display = 'none'; document.getElementById('2410.18582v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.14934">arXiv:2410.14934</a> <span> [<a href="https://arxiv.org/pdf/2410.14934">pdf</a>, <a href="https://arxiv.org/format/2410.14934">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> Development of a Simple and Novel Digital Twin Framework for Industrial Robots in Intelligent robotics manufacturing </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Xiang%2C+T">Tianyi Xiang</a>, <a href="/search/eess?searchtype=author&query=Li%2C+B">Borui Li</a>, <a href="/search/eess?searchtype=author&query=Pan%2C+X">Xin Pan</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+Q">Quan Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.14934v1-abstract-short" style="display: inline;"> This paper has proposed an easily replicable and novel approach for developing a Digital Twin (DT) system for industrial robots in intelligent manufacturing applications. Our framework enables effective communication via Robot Web Service (RWS), while a real-time simulation is implemented in Unity 3D and Web-based Platform without any other 3rd party tools. The framework can do real-time visualiza… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.14934v1-abstract-full').style.display = 'inline'; document.getElementById('2410.14934v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.14934v1-abstract-full" style="display: none;"> This paper has proposed an easily replicable and novel approach for developing a Digital Twin (DT) system for industrial robots in intelligent manufacturing applications. Our framework enables effective communication via Robot Web Service (RWS), while a real-time simulation is implemented in Unity 3D and Web-based Platform without any other 3rd party tools. The framework can do real-time visualization and control of the entire work process, as well as implement real-time path planning based on algorithms executed in MATLAB. Results verify the high communication efficiency with a refresh rate of only $17 ms$. Furthermore, our developed web-based platform and Graphical User Interface (GUI) enable easy accessibility and user-friendliness in real-time control. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.14934v1-abstract-full').style.display = 'none'; document.getElementById('2410.14934v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> 20th International Conference on Automation Science and Engineering (CASE 2024) </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.14928">arXiv:2410.14928</a> <span> [<a href="https://arxiv.org/pdf/2410.14928">pdf</a>, <a href="https://arxiv.org/format/2410.14928">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> A Novel Approach to Grasping Control of Soft Robotic Grippers based on Digital Twin </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Xiang%2C+T">Tianyi Xiang</a>, <a href="/search/eess?searchtype=author&query=Li%2C+B">Borui Li</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+Q">Quan Zhang</a>, <a href="/search/eess?searchtype=author&query=Leach%2C+M">Mark Leach</a>, <a href="/search/eess?searchtype=author&query=Lim%2C+E+G">Eng Gee Lim</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.14928v1-abstract-short" style="display: inline;"> This paper has proposed a Digital Twin (DT) framework for real-time motion and pose control of soft robotic grippers. The developed DT is based on an industrial robot workstation, integrated with our newly proposed approach for soft gripper control, primarily based on computer vision, for setting the driving pressure for desired gripper status in real-time. Knowing the gripper motion, the gripper… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.14928v1-abstract-full').style.display = 'inline'; document.getElementById('2410.14928v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.14928v1-abstract-full" style="display: none;"> This paper has proposed a Digital Twin (DT) framework for real-time motion and pose control of soft robotic grippers. The developed DT is based on an industrial robot workstation, integrated with our newly proposed approach for soft gripper control, primarily based on computer vision, for setting the driving pressure for desired gripper status in real-time. Knowing the gripper motion, the gripper parameters (e.g. curvatures and bending angles, etc.) are simulated by kinematics modelling in Unity 3D, which is based on four-piecewise constant curvature kinematics. The mapping in between the driving pressure and gripper parameters is achieved by implementing OpenCV based image processing algorithms and data fitting. Results show that our DT-based approach can achieve satisfactory performance in real-time control of soft gripper manipulation, which can satisfy a wide range of industrial applications. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.14928v1-abstract-full').style.display = 'none'; document.getElementById('2410.14928v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> 29th International Conference on Automation and Computing (ICAC 2024) </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.04097">arXiv:2410.04097</a> <span> [<a href="https://arxiv.org/pdf/2410.04097">pdf</a>, <a href="https://arxiv.org/format/2410.04097">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> TV-based Deep 3D Self Super-Resolution for fMRI </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=P%C3%A9rez-Bueno%2C+F">Fernando P茅rez-Bueno</a>, <a href="/search/eess?searchtype=author&query=Li%2C+H+B">Hongwei Bran Li</a>, <a href="/search/eess?searchtype=author&query=Rosen%2C+M+S">Matthew S. Rosen</a>, <a href="/search/eess?searchtype=author&query=Nasr%2C+S">Shahin Nasr</a>, <a href="/search/eess?searchtype=author&query=Caballero-Gaudes%2C+C">Cesar Caballero-Gaudes</a>, <a href="/search/eess?searchtype=author&query=Iglesias%2C+J+E">Juan Eugenio Iglesias</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.04097v2-abstract-short" style="display: inline;"> While functional Magnetic Resonance Imaging (fMRI) offers valuable insights into cognitive processes, its inherent spatial limitations pose challenges for detailed analysis of the fine-grained functional architecture of the brain. More specifically, MRI scanner and sequence specifications impose a trade-off between temporal resolution, spatial resolution, signal-to-noise ratio, and scan time. Deep… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.04097v2-abstract-full').style.display = 'inline'; document.getElementById('2410.04097v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.04097v2-abstract-full" style="display: none;"> While functional Magnetic Resonance Imaging (fMRI) offers valuable insights into cognitive processes, its inherent spatial limitations pose challenges for detailed analysis of the fine-grained functional architecture of the brain. More specifically, MRI scanner and sequence specifications impose a trade-off between temporal resolution, spatial resolution, signal-to-noise ratio, and scan time. Deep Learning (DL) Super-Resolution (SR) methods have emerged as a promising solution to enhance fMRI resolution, generating high-resolution (HR) images from low-resolution (LR) images typically acquired with lower scanning times. However, most existing SR approaches depend on supervised DL techniques, which require training ground truth (GT) HR data, which is often difficult to acquire and simultaneously sets a bound for how far SR can go. In this paper, we introduce a novel self-supervised DL SR model that combines a DL network with an analytical approach and Total Variation (TV) regularization. Our method eliminates the need for external GT images, achieving competitive performance compared to supervised DL techniques and preserving the functional maps. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.04097v2-abstract-full').style.display = 'none'; document.getElementById('2410.04097v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 5 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Preprint Submitted to ISBI 2025 (Accepted)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.18701">arXiv:2409.18701</a> <span> [<a href="https://arxiv.org/pdf/2409.18701">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> 3DPX: Single Panoramic X-ray Analysis Guided by 3D Oral Structure Reconstruction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Li%2C+X">Xiaoshuang Li</a>, <a href="/search/eess?searchtype=author&query=Huang%2C+Z">Zimo Huang</a>, <a href="/search/eess?searchtype=author&query=Meng%2C+M">Mingyuan Meng</a>, <a href="/search/eess?searchtype=author&query=Delamare%2C+E">Eduardo Delamare</a>, <a href="/search/eess?searchtype=author&query=Feng%2C+D">Dagan Feng</a>, <a href="/search/eess?searchtype=author&query=Bi%2C+L">Lei Bi</a>, <a href="/search/eess?searchtype=author&query=Sheng%2C+B">Bin Sheng</a>, <a href="/search/eess?searchtype=author&query=Jiang%2C+L">Lingyong Jiang</a>, <a href="/search/eess?searchtype=author&query=Li%2C+B">Bo Li</a>, <a href="/search/eess?searchtype=author&query=Kim%2C+J">Jinman Kim</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.18701v1-abstract-short" style="display: inline;"> Panoramic X-ray (PX) is a prevalent modality in dentistry practice owing to its wide availability and low cost. However, as a 2D projection of a 3D structure, PX suffers from anatomical information loss and PX diagnosis is limited compared to that with 3D imaging modalities. 2D-to-3D reconstruction methods have been explored for the ability to synthesize the absent 3D anatomical information from 2… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.18701v1-abstract-full').style.display = 'inline'; document.getElementById('2409.18701v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.18701v1-abstract-full" style="display: none;"> Panoramic X-ray (PX) is a prevalent modality in dentistry practice owing to its wide availability and low cost. However, as a 2D projection of a 3D structure, PX suffers from anatomical information loss and PX diagnosis is limited compared to that with 3D imaging modalities. 2D-to-3D reconstruction methods have been explored for the ability to synthesize the absent 3D anatomical information from 2D PX for use in PX image analysis. However, there are challenges in leveraging such 3D synthesized reconstructions. First, inferring 3D depth from 2D images remains a challenging task with limited accuracy. The second challenge is the joint analysis of 2D PX with its 3D synthesized counterpart, with the aim to maximize the 2D-3D synergy while minimizing the errors arising from the synthesized image. In this study, we propose a new method termed 3DPX - PX image analysis guided by 2D-to-3D reconstruction, to overcome these challenges. 3DPX consists of (i) a novel progressive reconstruction network to improve 2D-to-3D reconstruction and, (ii) a contrastive-guided bidirectional multimodality alignment module for 3D-guided 2D PX classification and segmentation tasks. The reconstruction network progressively reconstructs 3D images with knowledge imposed on the intermediate reconstructions at multiple pyramid levels and incorporates Multilayer Perceptrons to improve semantic understanding. The downstream networks leverage the reconstructed images as 3D anatomical guidance to the PX analysis through feature alignment, which increases the 2D-3D synergy with bidirectional feature projection and decease the impact of potential errors with contrastive guidance. Extensive experiments on two oral datasets involving 464 studies demonstrate that 3DPX outperforms the state-of-the-art methods in various tasks including 2D-to-3D reconstruction, PX classification and lesion segmentation. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.18701v1-abstract-full').style.display = 'none'; document.getElementById('2409.18701v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.17569">arXiv:2409.17569</a> <span> [<a href="https://arxiv.org/pdf/2409.17569">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> A novel brain registration model combining structural and functional MRI information </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Li%2C+B">Baolong Li</a>, <a href="/search/eess?searchtype=author&query=Shi%2C+Y">Yuhu Shi</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+L">Lei Wang</a>, <a href="/search/eess?searchtype=author&query=Zeng%2C+W">Weiming Zeng</a>, <a href="/search/eess?searchtype=author&query=Zhu%2C+C">Changming Zhu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.17569v1-abstract-short" style="display: inline;"> Although developed functional magnetic resonance imaging (fMRI) registration algorithms based on deep learning have achieved a certain degree of alignment of functional area, they underutilized fine structural information. In this paper, we propose a semi-supervised convolutional neural network (CNN) registration model that integrates both structural and functional MRI information. The model first… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.17569v1-abstract-full').style.display = 'inline'; document.getElementById('2409.17569v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.17569v1-abstract-full" style="display: none;"> Although developed functional magnetic resonance imaging (fMRI) registration algorithms based on deep learning have achieved a certain degree of alignment of functional area, they underutilized fine structural information. In this paper, we propose a semi-supervised convolutional neural network (CNN) registration model that integrates both structural and functional MRI information. The model first learns to generate deformation fields by inputting structural MRI (T1w-MRI) into the CNN to capture fine structural information. Then, we construct a local functional connectivity pattern to describe the local fMRI information, and use the Bhattacharyya coefficient to measure the similarity between two fMRI images, which is used as a loss function to facilitate the alignment of functional areas. In the inter-subject registration experiment, our model achieved an average number of voxels exceeding the threshold of 4.24 is 2248 in the group-level t-test maps for the four functional brain networks (default mode network, visual network, central executive network, and sensorimotor network). Additionally, the atlas-based registration experiment results show that the average number of voxels exceeding this threshold is 3620. The results are the largest among all methods. Our model achieves an excellent registration performance in fMRI and improves the consistency of functional regions. The proposed model has the potential to optimize fMRI image processing and analysis, facilitating the development of fMRI applications. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.17569v1-abstract-full').style.display = 'none'; document.getElementById('2409.17569v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.17139">arXiv:2409.17139</a> <span> [<a href="https://arxiv.org/pdf/2409.17139">pdf</a>, <a href="https://arxiv.org/format/2409.17139">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Networking and Internet Architecture">cs.NI</span> </div> </div> <p class="title is-5 mathjax"> Learning with Dynamics: Autonomous Regulation of UAV Based Communication Networks with Dynamic UAV Crew </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Zhang%2C+R">Ran Zhang</a>, <a href="/search/eess?searchtype=author&query=Li%2C+B">Bowei Li</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+L">Liyuan Zhang</a>, <a href="/search/eess?searchtype=author&query=Jiang"> Jiang</a>, <a href="/search/eess?searchtype=author&query=Xie"> Xie</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+M">Miao Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.17139v1-abstract-short" style="display: inline;"> Unmanned Aerial Vehicle (UAV) based communication networks (UCNs) are a key component in future mobile networking. To handle the dynamic environments in UCNs, reinforcement learning (RL) has been a promising solution attributed to its strong capability of adaptive decision-making free of the environment models. However, most existing RL-based research focus on control strategy design assuming a fi… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.17139v1-abstract-full').style.display = 'inline'; document.getElementById('2409.17139v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.17139v1-abstract-full" style="display: none;"> Unmanned Aerial Vehicle (UAV) based communication networks (UCNs) are a key component in future mobile networking. To handle the dynamic environments in UCNs, reinforcement learning (RL) has been a promising solution attributed to its strong capability of adaptive decision-making free of the environment models. However, most existing RL-based research focus on control strategy design assuming a fixed set of UAVs. Few works have investigated how UCNs should be adaptively regulated when the serving UAVs change dynamically. This article discusses RL-based strategy design for adaptive UCN regulation given a dynamic UAV set, addressing both reactive strategies in general UCNs and proactive strategies in solar-powered UCNs. An overview of the UCN and the RL framework is first provided. Potential research directions with key challenges and possible solutions are then elaborated. Some of our recent works are presented as case studies to inspire innovative ways to handle dynamic UAV crew with different RL algorithms. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.17139v1-abstract-full').style.display = 'none'; document.getElementById('2409.17139v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">7 pages, 6 figures, magazine paper</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.13371">arXiv:2409.13371</a> <span> [<a href="https://arxiv.org/pdf/2409.13371">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> MCICSAM: Monte Carlo-guided Interpolation Consistency Segment Anything Model for Semi-Supervised Prostate Zone Segmentation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Huang%2C+G">Guantian Huang</a>, <a href="/search/eess?searchtype=author&query=Li%2C+B">Beibei Li</a>, <a href="/search/eess?searchtype=author&query=Fan%2C+X">Xiaobing Fan</a>, <a href="/search/eess?searchtype=author&query=Chatterjee%2C+A">Aritrick Chatterjee</a>, <a href="/search/eess?searchtype=author&query=Wei%2C+C">Cheng Wei</a>, <a href="/search/eess?searchtype=author&query=Qi%2C+S">Shouliang Qi</a>, <a href="/search/eess?searchtype=author&query=Qian%2C+W">Wei Qian</a>, <a href="/search/eess?searchtype=author&query=He%2C+D">Dianning He</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.13371v1-abstract-short" style="display: inline;"> Accurate segmentation of various regions within the prostate is pivotal for diagnosing and treating prostate-related diseases. However, the scarcity of labeled data, particularly in specialized medical fields like prostate imaging, poses a significant challenge. Segment Anything Model (SAM) is a new large model for natural image segmentation, but there are some challenges in medical imaging. In or… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.13371v1-abstract-full').style.display = 'inline'; document.getElementById('2409.13371v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.13371v1-abstract-full" style="display: none;"> Accurate segmentation of various regions within the prostate is pivotal for diagnosing and treating prostate-related diseases. However, the scarcity of labeled data, particularly in specialized medical fields like prostate imaging, poses a significant challenge. Segment Anything Model (SAM) is a new large model for natural image segmentation, but there are some challenges in medical imaging. In order to better utilize the powerful feature extraction capability of SAM as well as to address the problem of low data volume for medical image annotation, we use Low-Rank Adaptation (LoRA) and semi-supervised learning methods of Monte Carlo guided interpolation consistency (MCIC) to enhance the fine-tuned SAM. We propose Monte Carlo-guided Interpolation Consistency Segment Anything Model (MCICSAM) for application to semi-supervised learning based prostate region segmentation. In the unlabeled data section, MCIC performs two different interpolation transformations on the input data and incorporates Monte Carlo uncertainty analysis in the output, forcing the model to be consistent in its predictions. The consistency constraints imposed on these interpolated samples allow the model to fit the distribution of unlabeled data better, ultimately improving its performance in semi-supervised scenarios. We use Dice and Hausdorff Distance at 95th percentile (HD95) to validate model performance. MCICSAM yieldes Dice with 79.38% and 89.95%, along with improves HD95 values of 3.12 and 2.27 for transition zone and transition zone. At the same time MCICSAM demonstrates strong generalizability. This method is expected to bring new possibilities in the field of prostate image segmentation. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.13371v1-abstract-full').style.display = 'none'; document.getElementById('2409.13371v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">13 pages, 5 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.12966">arXiv:2409.12966</a> <span> [<a href="https://arxiv.org/pdf/2409.12966">pdf</a>, <a href="https://arxiv.org/format/2409.12966">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Neural and Evolutionary Computing">cs.NE</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> An Efficient General-Purpose Optical Accelerator for Neural Networks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Fei%2C+S">Sijie Fei</a>, <a href="/search/eess?searchtype=author&query=Eldebiky%2C+A">Amro Eldebiky</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+G+L">Grace Li Zhang</a>, <a href="/search/eess?searchtype=author&query=Li%2C+B">Bing Li</a>, <a href="/search/eess?searchtype=author&query=Schlichtmann%2C+U">Ulf Schlichtmann</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.12966v1-abstract-short" style="display: inline;"> General-purpose optical accelerators (GOAs) have emerged as a promising platform to accelerate deep neural networks (DNNs) due to their low latency and energy consumption. Such an accelerator is usually composed of a given number of interleaving Mach-Zehnder- Interferometers (MZIs). This interleaving architecture, however, has a low efficiency when accelerating neural networks of various sizes due… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.12966v1-abstract-full').style.display = 'inline'; document.getElementById('2409.12966v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.12966v1-abstract-full" style="display: none;"> General-purpose optical accelerators (GOAs) have emerged as a promising platform to accelerate deep neural networks (DNNs) due to their low latency and energy consumption. Such an accelerator is usually composed of a given number of interleaving Mach-Zehnder- Interferometers (MZIs). This interleaving architecture, however, has a low efficiency when accelerating neural networks of various sizes due to the mismatch between weight matrices and the GOA architecture. In this work, a hybrid GOA architecture is proposed to enhance the mapping efficiency of neural networks onto the GOA. In this architecture, independent MZI modules are connected with microring resonators (MRRs), so that they can be combined to process large neural networks efficiently. Each of these modules implements a unitary matrix with inputs adjusted by tunable coefficients. The parameters of the proposed architecture are searched using genetic algorithm. To enhance the accuracy of neural networks, selected weight matrices are expanded to multiple unitary matrices applying singular value decomposition (SVD). The kernels in neural networks are also adjusted to use up the on-chip computational resources. Experimental results show that with a given number of MZIs, the mapping efficiency of neural networks on the proposed architecture can be enhanced by 21.87%, 21.20%, 24.69%, and 25.52% for VGG16 and Resnet18 on datasets Cifar10 and Cifar100, respectively. The energy consumption and computation latency can also be reduced by over 67% and 21%, respectively. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.12966v1-abstract-full').style.display = 'none'; document.getElementById('2409.12966v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">accepted by ASPDAC2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.09214">arXiv:2409.09214</a> <span> [<a href="https://arxiv.org/pdf/2409.09214">pdf</a>, <a href="https://arxiv.org/format/2409.09214">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Seed-Music: A Unified Framework for High Quality and Controlled Music Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Bai%2C+Y">Ye Bai</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+H">Haonan Chen</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+J">Jitong Chen</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+Z">Zhuo Chen</a>, <a href="/search/eess?searchtype=author&query=Deng%2C+Y">Yi Deng</a>, <a href="/search/eess?searchtype=author&query=Dong%2C+X">Xiaohong Dong</a>, <a href="/search/eess?searchtype=author&query=Hantrakul%2C+L">Lamtharn Hantrakul</a>, <a href="/search/eess?searchtype=author&query=Hao%2C+W">Weituo Hao</a>, <a href="/search/eess?searchtype=author&query=Huang%2C+Q">Qingqing Huang</a>, <a href="/search/eess?searchtype=author&query=Huang%2C+Z">Zhongyi Huang</a>, <a href="/search/eess?searchtype=author&query=Jia%2C+D">Dongya Jia</a>, <a href="/search/eess?searchtype=author&query=La%2C+F">Feihu La</a>, <a href="/search/eess?searchtype=author&query=Le%2C+D">Duc Le</a>, <a href="/search/eess?searchtype=author&query=Li%2C+B">Bochen Li</a>, <a href="/search/eess?searchtype=author&query=Li%2C+C">Chumin Li</a>, <a href="/search/eess?searchtype=author&query=Li%2C+H">Hui Li</a>, <a href="/search/eess?searchtype=author&query=Li%2C+X">Xingxing Li</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+S">Shouda Liu</a>, <a href="/search/eess?searchtype=author&query=Lu%2C+W">Wei-Tsung Lu</a>, <a href="/search/eess?searchtype=author&query=Lu%2C+Y">Yiqing Lu</a>, <a href="/search/eess?searchtype=author&query=Shaw%2C+A">Andrew Shaw</a>, <a href="/search/eess?searchtype=author&query=Spijkervet%2C+J">Janne Spijkervet</a>, <a href="/search/eess?searchtype=author&query=Sun%2C+Y">Yakun Sun</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+B">Bo Wang</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+J">Ju-Chiang Wang</a> , et al. (13 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.09214v3-abstract-short" style="display: inline;"> We introduce Seed-Music, a suite of music generation systems capable of producing high-quality music with fine-grained style control. Our unified framework leverages both auto-regressive language modeling and diffusion approaches to support two key music creation workflows: controlled music generation and post-production editing. For controlled music generation, our system enables vocal music gene… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.09214v3-abstract-full').style.display = 'inline'; document.getElementById('2409.09214v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.09214v3-abstract-full" style="display: none;"> We introduce Seed-Music, a suite of music generation systems capable of producing high-quality music with fine-grained style control. Our unified framework leverages both auto-regressive language modeling and diffusion approaches to support two key music creation workflows: controlled music generation and post-production editing. For controlled music generation, our system enables vocal music generation with performance controls from multi-modal inputs, including style descriptions, audio references, musical scores, and voice prompts. For post-production editing, it offers interactive tools for editing lyrics and vocal melodies directly in the generated audio. We encourage readers to listen to demo audio examples at https://team.doubao.com/seed-music "https://team.doubao.com/seed-music". <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.09214v3-abstract-full').style.display = 'none'; document.getElementById('2409.09214v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 13 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Seed-Music technical report, 20 pages, 5 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.08463">arXiv:2409.08463</a> <span> [<a href="https://arxiv.org/pdf/2409.08463">pdf</a>, <a href="https://arxiv.org/format/2409.08463">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> Evaluating the Quality of Brain MRI Generators </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Wu%2C+J">Jiaqi Wu</a>, <a href="/search/eess?searchtype=author&query=Peng%2C+W">Wei Peng</a>, <a href="/search/eess?searchtype=author&query=Li%2C+B">Binxu Li</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+Y">Yu Zhang</a>, <a href="/search/eess?searchtype=author&query=Pohl%2C+K+M">Kilian M. Pohl</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.08463v1-abstract-short" style="display: inline;"> Deep learning models generating structural brain MRIs have the potential to significantly accelerate discovery of neuroscience studies. However, their use has been limited in part by the way their quality is evaluated. Most evaluations of generative models focus on metrics originally designed for natural images (such as structural similarity index and Frechet inception distance). As we show in a c… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.08463v1-abstract-full').style.display = 'inline'; document.getElementById('2409.08463v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.08463v1-abstract-full" style="display: none;"> Deep learning models generating structural brain MRIs have the potential to significantly accelerate discovery of neuroscience studies. However, their use has been limited in part by the way their quality is evaluated. Most evaluations of generative models focus on metrics originally designed for natural images (such as structural similarity index and Frechet inception distance). As we show in a comparison of 6 state-of-the-art generative models trained and tested on over 3000 MRIs, these metrics are sensitive to the experimental setup and inadequately assess how well brain MRIs capture macrostructural properties of brain regions (i.e., anatomical plausibility). This shortcoming of the metrics results in inconclusive findings even when qualitative differences between the outputs of models are evident. We therefore propose a framework for evaluating models generating brain MRIs, which requires uniform processing of the real MRIs, standardizing the implementation of the models, and automatically segmenting the MRIs generated by the models. The segmentations are used for quantifying the plausibility of anatomy displayed in the MRIs. To ensure meaningful quantification, it is crucial that the segmentations are highly reliable. Our framework rigorously checks this reliability, a step often overlooked by prior work. Only 3 of the 6 generative models produced MRIs, of which at least 95% had highly reliable segmentations. More importantly, the assessment of each model by our framework is in line with qualitative assessments, reinforcing the validity of our approach. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.08463v1-abstract-full').style.display = 'none'; document.getElementById('2409.08463v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted MICCAI 20224</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.06010">arXiv:2409.06010</a> <span> [<a href="https://arxiv.org/pdf/2409.06010">pdf</a>, <a href="https://arxiv.org/format/2409.06010">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Networking and Internet Architecture">cs.NI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> When Learning Meets Dynamics: Distributed User Connectivity Maximization in UAV-Based Communication Networks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Li%2C+B">Bowei Li</a>, <a href="/search/eess?searchtype=author&query=Tripathi%2C+S">Saugat Tripathi</a>, <a href="/search/eess?searchtype=author&query=Hosain%2C+S">Salman Hosain</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+R">Ran Zhang</a>, <a href="/search/eess?searchtype=author&query=Jiang"> Jiang</a>, <a href="/search/eess?searchtype=author&query=Xie"> Xie</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+M">Miao Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.06010v1-abstract-short" style="display: inline;"> Distributed management over Unmanned Aerial Vehicle (UAV) based communication networks (UCNs) has attracted increasing research attention. In this work, we study a distributed user connectivity maximization problem in a UCN. The work features a horizontal study over different levels of information exchange during the distributed iteration and a consideration of dynamics in UAV set and user distrib… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.06010v1-abstract-full').style.display = 'inline'; document.getElementById('2409.06010v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.06010v1-abstract-full" style="display: none;"> Distributed management over Unmanned Aerial Vehicle (UAV) based communication networks (UCNs) has attracted increasing research attention. In this work, we study a distributed user connectivity maximization problem in a UCN. The work features a horizontal study over different levels of information exchange during the distributed iteration and a consideration of dynamics in UAV set and user distribution, which are not well addressed in the existing works. Specifically, the studied problem is first formulated into a time-coupled mixed-integer non-convex optimization problem. A heuristic two-stage UAV-user association policy is proposed to faster determine the user connectivity. To tackle the NP-hard problem in scalable manner, the distributed user connectivity maximization algorithm 1 (DUCM-1) is proposed under the multi-agent deep Q learning (MA-DQL) framework. DUCM-1 emphasizes on designing different information exchange levels and evaluating how they impact the learning convergence with stationary and dynamic user distribution. To comply with the UAV dynamics, DUCM-2 algorithm is developed which is devoted to autonomously handling arbitrary quit's and join-in's of UAVs in a considered time horizon. Extensive simulations are conducted i) to conclude that exchanging state information with a deliberated task-specific reward function design yields the best convergence performance, and ii) to show the efficacy and robustness of DUCM-2 against the dynamics. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.06010v1-abstract-full').style.display = 'none'; document.getElementById('2409.06010v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">12 pages, 12 figures, journal draft</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.05020">arXiv:2409.05020</a> <span> [<a href="https://arxiv.org/pdf/2409.05020">pdf</a>, <a href="https://arxiv.org/format/2409.05020">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Data Structures and Algorithms">cs.DS</span> </div> </div> <p class="title is-5 mathjax"> A Performance Bound for the Greedy Algorithm in a Generalized Class of String Optimization Problems </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Van+Over%2C+B">Brandon Van Over</a>, <a href="/search/eess?searchtype=author&query=Li%2C+B">Bowen Li</a>, <a href="/search/eess?searchtype=author&query=Chong%2C+E+K+P">Edwin K. P. Chong</a>, <a href="/search/eess?searchtype=author&query=Pezeshki%2C+A">Ali Pezeshki</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.05020v1-abstract-short" style="display: inline;"> We present a simple performance bound for the greedy scheme in string optimization problems that obtains strong results. Our approach vastly generalizes the group of previously established greedy curvature bounds by Conforti and Cornu茅jols (1984). We consider three constants, $伪_G$, $伪_G'$, and $伪_G''$ introduced by Conforti and Cornu茅jols (1984), that are used in performance bounds of greedy sche… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.05020v1-abstract-full').style.display = 'inline'; document.getElementById('2409.05020v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.05020v1-abstract-full" style="display: none;"> We present a simple performance bound for the greedy scheme in string optimization problems that obtains strong results. Our approach vastly generalizes the group of previously established greedy curvature bounds by Conforti and Cornu茅jols (1984). We consider three constants, $伪_G$, $伪_G'$, and $伪_G''$ introduced by Conforti and Cornu茅jols (1984), that are used in performance bounds of greedy schemes in submodular set optimization. We first generalize both of the $伪_G$ and $伪_G''$ bounds to string optimization problems in a manner that includes maximizing submodular set functions over matroids as a special case. We then derive a much simpler and computable bound that allows for applications to a far more general class of functions with string domains. We prove that our bound is superior to both the $伪_G$ and $伪_G''$ bounds and provide a counterexample to show that the $伪_G'$ bound is incorrect under the assumptions in Conforti and Cornu茅jols (1984). We conclude with two applications. The first is an application of our result to sensor coverage problems. We demonstrate our performance bound in cases where the objective function is set submodular and string submodular. The second is an application to a social welfare maximization problem with black-box utility functions. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.05020v1-abstract-full').style.display = 'none'; document.getElementById('2409.05020v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.03055">arXiv:2409.03055</a> <span> [<a href="https://arxiv.org/pdf/2409.03055">pdf</a>, <a href="https://arxiv.org/format/2409.03055">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> SymPAC: Scalable Symbolic Music Generation With Prompts And Constraints </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Chen%2C+H">Haonan Chen</a>, <a href="/search/eess?searchtype=author&query=Smith%2C+J+B+L">Jordan B. L. Smith</a>, <a href="/search/eess?searchtype=author&query=Spijkervet%2C+J">Janne Spijkervet</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+J">Ju-Chiang Wang</a>, <a href="/search/eess?searchtype=author&query=Zou%2C+P">Pei Zou</a>, <a href="/search/eess?searchtype=author&query=Li%2C+B">Bochen Li</a>, <a href="/search/eess?searchtype=author&query=Kong%2C+Q">Qiuqiang Kong</a>, <a href="/search/eess?searchtype=author&query=Du%2C+X">Xingjian Du</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.03055v2-abstract-short" style="display: inline;"> Progress in the task of symbolic music generation may be lagging behind other tasks like audio and text generation, in part because of the scarcity of symbolic training data. In this paper, we leverage the greater scale of audio music data by applying pre-trained MIR models (for transcription, beat tracking, structure analysis, etc.) to extract symbolic events and encode them into token sequences.… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.03055v2-abstract-full').style.display = 'inline'; document.getElementById('2409.03055v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.03055v2-abstract-full" style="display: none;"> Progress in the task of symbolic music generation may be lagging behind other tasks like audio and text generation, in part because of the scarcity of symbolic training data. In this paper, we leverage the greater scale of audio music data by applying pre-trained MIR models (for transcription, beat tracking, structure analysis, etc.) to extract symbolic events and encode them into token sequences. To the best of our knowledge, this work is the first to demonstrate the feasibility of training symbolic generation models solely from auto-transcribed audio data. Furthermore, to enhance the controllability of the trained model, we introduce SymPAC (Symbolic Music Language Model with Prompting And Constrained Generation), which is distinguished by using (a) prompt bars in encoding and (b) a technique called Constrained Generation via Finite State Machines (FSMs) during inference time. We show the flexibility and controllability of this approach, which may be critical in making music AI useful to creators and users. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.03055v2-abstract-full').style.display = 'none'; document.getElementById('2409.03055v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 4 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ISMIR 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.00956">arXiv:2409.00956</a> <span> [<a href="https://arxiv.org/pdf/2409.00956">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Physics-Informed Neural Network Based Digital Image Correlation Method </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Li%2C+B">Boda Li</a>, <a href="/search/eess?searchtype=author&query=Zhou%2C+S">Shichao Zhou</a>, <a href="/search/eess?searchtype=author&query=Ma%2C+Q">Qinwei Ma</a>, <a href="/search/eess?searchtype=author&query=Ma%2C+S">Shaopeng Ma</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.00956v1-abstract-short" style="display: inline;"> Digital Image Correlation (DIC) is a key technique in experimental mechanics for full-field deformation measurement, traditionally relying on subset matching to determine displacement fields. However, selecting optimal parameters like shape functions and subset size can be challenging in non-uniform deformation scenarios. Recent deep learning-based DIC approaches, both supervised and unsupervised,… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.00956v1-abstract-full').style.display = 'inline'; document.getElementById('2409.00956v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.00956v1-abstract-full" style="display: none;"> Digital Image Correlation (DIC) is a key technique in experimental mechanics for full-field deformation measurement, traditionally relying on subset matching to determine displacement fields. However, selecting optimal parameters like shape functions and subset size can be challenging in non-uniform deformation scenarios. Recent deep learning-based DIC approaches, both supervised and unsupervised, use neural networks to map speckle images to deformation fields, offering precise measurements without manual tuning. However, these methods require complex network architectures to extract speckle image features, which does not guarantee solution accuracy This paper introduces PINN-DIC, a novel DIC method based on Physics-Informed Neural Networks (PINNs). Unlike traditional approaches, PINN-DIC uses a simple fully connected neural network that takes the coordinate domain as input and outputs the displacement field. By integrating the DIC governing equation into the loss function, PINN-DIC directly extracts the displacement field from reference and deformed speckle images through iterative optimization. Evaluations on simulated and real experiments demonstrate that PINN-DIC maintains the accuracy of deep learning-based DIC in non-uniform fields while offering three distinct advantages: 1) enhanced precision with a simpler network by directly fitting the displacement field from coordinates, 2) effective handling of irregular boundary displacement fields with minimal parameter adjustments, and 3) easy integration with other neural network-based mechanical analysis methods for comprehensive DIC result analysis. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.00956v1-abstract-full').style.display = 'none'; document.getElementById('2409.00956v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.00292">arXiv:2409.00292</a> <span> [<a href="https://arxiv.org/pdf/2409.00292">pdf</a>, <a href="https://arxiv.org/format/2409.00292">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> REFFLY: Melody-Constrained Lyrics Editing Model </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Zhao%2C+S">Songyan Zhao</a>, <a href="/search/eess?searchtype=author&query=Li%2C+B">Bingxuan Li</a>, <a href="/search/eess?searchtype=author&query=Tian%2C+Y">Yufei Tian</a>, <a href="/search/eess?searchtype=author&query=Peng%2C+N">Nanyun Peng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.00292v1-abstract-short" style="display: inline;"> Automatic melody-to-lyric generation aims to produce lyrics that align with a given melody. Although previous work can generate lyrics based on high-level control signals, such as keywords or genre, they often struggle with three challenges: (1) lack of controllability, as prior works are only able to produce lyrics from scratch, with little or no control over the content; (2) inability to generat… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.00292v1-abstract-full').style.display = 'inline'; document.getElementById('2409.00292v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.00292v1-abstract-full" style="display: none;"> Automatic melody-to-lyric generation aims to produce lyrics that align with a given melody. Although previous work can generate lyrics based on high-level control signals, such as keywords or genre, they often struggle with three challenges: (1) lack of controllability, as prior works are only able to produce lyrics from scratch, with little or no control over the content; (2) inability to generate fully structured songs with the desired format; and (3) failure to align prominent words in the lyrics with prominent notes in the melody, resulting in poor lyrics-melody alignment. In this work, we introduce REFFLY (REvision Framework For Lyrics), the first revision framework designed to edit arbitrary forms of plain text draft into high-quality, full-fledged song lyrics. Our approach ensures that the generated lyrics retain the original meaning of the draft, align with the melody, and adhere to the desired song structures. We demonstrate that REFFLY performs well in diverse task settings, such as lyrics revision and song translation. Experimental results show that our model outperforms strong baselines, such as Lyra (Tian et al. 2023) and GPT-4, by 25% in both musicality and text quality. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.00292v1-abstract-full').style.display = 'none'; document.getElementById('2409.00292v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.16315">arXiv:2408.16315</a> <span> [<a href="https://arxiv.org/pdf/2408.16315">pdf</a>, <a href="https://arxiv.org/format/2408.16315">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Passenger hazard perception based on EEG signals for highly automated driving vehicles </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Tan%2C+A+Y+X">Ashton Yu Xuan Tan</a>, <a href="/search/eess?searchtype=author&query=Yang%2C+Y">Yingkai Yang</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+X">Xiaofei Zhang</a>, <a href="/search/eess?searchtype=author&query=Li%2C+B">Bowen Li</a>, <a href="/search/eess?searchtype=author&query=Gao%2C+X">Xiaorong Gao</a>, <a href="/search/eess?searchtype=author&query=Zheng%2C+S">Sifa Zheng</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+J">Jianqiang Wang</a>, <a href="/search/eess?searchtype=author&query=Gu%2C+X">Xinyu Gu</a>, <a href="/search/eess?searchtype=author&query=Li%2C+J">Jun Li</a>, <a href="/search/eess?searchtype=author&query=Zhao%2C+Y">Yang Zhao</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+Y">Yuxin Zhang</a>, <a href="/search/eess?searchtype=author&query=Stathaki%2C+T">Tania Stathaki</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.16315v1-abstract-short" style="display: inline;"> Enhancing the safety of autonomous vehicles is crucial, especially given recent accidents involving automated systems. As passengers in these vehicles, humans' sensory perception and decision-making can be integrated with autonomous systems to improve safety. This study explores neural mechanisms in passenger-vehicle interactions, leading to the development of a Passenger Cognitive Model (PCM) and… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.16315v1-abstract-full').style.display = 'inline'; document.getElementById('2408.16315v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.16315v1-abstract-full" style="display: none;"> Enhancing the safety of autonomous vehicles is crucial, especially given recent accidents involving automated systems. As passengers in these vehicles, humans' sensory perception and decision-making can be integrated with autonomous systems to improve safety. This study explores neural mechanisms in passenger-vehicle interactions, leading to the development of a Passenger Cognitive Model (PCM) and the Passenger EEG Decoding Strategy (PEDS). Central to PEDS is a novel Convolutional Recurrent Neural Network (CRNN) that captures spatial and temporal EEG data patterns. The CRNN, combined with stacking algorithms, achieves an accuracy of $85.0\% \pm 3.18\%$. Our findings highlight the predictive power of pre-event EEG data, enhancing the detection of hazardous scenarios and offering a network-driven framework for safer autonomous vehicles. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.16315v1-abstract-full').style.display = 'none'; document.getElementById('2408.16315v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.15887">arXiv:2408.15887</a> <span> [<a href="https://arxiv.org/pdf/2408.15887">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> SpineMamba: Enhancing 3D Spinal Segmentation in Clinical Imaging through Residual Visual Mamba Layers and Shape Priors </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Zhang%2C+Z">Zhiqing Zhang</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+T">Tianyong Liu</a>, <a href="/search/eess?searchtype=author&query=Fan%2C+G">Guojia Fan</a>, <a href="/search/eess?searchtype=author&query=Li%2C+B">Bin Li</a>, <a href="/search/eess?searchtype=author&query=Feng%2C+Q">Qianjin Feng</a>, <a href="/search/eess?searchtype=author&query=Zhou%2C+S">Shoujun Zhou</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.15887v1-abstract-short" style="display: inline;"> Accurate segmentation of 3D clinical medical images is critical in the diagnosis and treatment of spinal diseases. However, the inherent complexity of spinal anatomy and uncertainty inherent in current imaging technologies, poses significant challenges for semantic segmentation of spinal images. Although convolutional neural networks (CNNs) and Transformer-based models have made some progress in s… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.15887v1-abstract-full').style.display = 'inline'; document.getElementById('2408.15887v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.15887v1-abstract-full" style="display: none;"> Accurate segmentation of 3D clinical medical images is critical in the diagnosis and treatment of spinal diseases. However, the inherent complexity of spinal anatomy and uncertainty inherent in current imaging technologies, poses significant challenges for semantic segmentation of spinal images. Although convolutional neural networks (CNNs) and Transformer-based models have made some progress in spinal segmentation, their limitations in handling long-range dependencies hinder further improvements in segmentation accuracy.To address these challenges, we introduce a residual visual Mamba layer to effectively capture and model the deep semantic features and long-range spatial dependencies of 3D spinal data. To further enhance the structural semantic understanding of the vertebrae, we also propose a novel spinal shape prior module that captures specific anatomical information of the spine from medical images, significantly enhancing the model's ability to extract structural semantic information of the vertebrae. Comparative and ablation experiments on two datasets demonstrate that SpineMamba outperforms existing state-of-the-art models. On the CT dataset, the average Dice similarity coefficient for segmentation reaches as high as 94.40, while on the MR dataset, it reaches 86.95. Notably, compared to the renowned nnU-Net, SpineMamba achieves superior segmentation performance, exceeding it by up to 2 percentage points. This underscores its accuracy, robustness, and excellent generalization capabilities. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.15887v1-abstract-full').style.display = 'none'; document.getElementById('2408.15887v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">17 pages, 11 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.14027">arXiv:2408.14027</a> <span> [<a href="https://arxiv.org/pdf/2408.14027">pdf</a>, <a href="https://arxiv.org/format/2408.14027">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> UAV-Enabled Integrated Sensing and Communication in Maritime Emergency Networks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Li%2C+B">Bohan Li</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+J">Jiahao Liu</a>, <a href="/search/eess?searchtype=author&query=Xiong%2C+Y">Yifeng Xiong</a>, <a href="/search/eess?searchtype=author&query=Mu%2C+J">Junsheng Mu</a>, <a href="/search/eess?searchtype=author&query=Xiao%2C+P">Pei Xiao</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+S">Sheng Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.14027v1-abstract-short" style="display: inline;"> With line-of-sight mode deployment and fast response, unmanned aerial vehicle (UAV), equipped with the cutting-edge integrated sensing and communication (ISAC) technique, is poised to deliver high-quality communication and sensing services in maritime emergency scenarios. In practice, however, the real-time transmission of ISAC signals at the UAV side cannot be realized unless the reliable wireles… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.14027v1-abstract-full').style.display = 'inline'; document.getElementById('2408.14027v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.14027v1-abstract-full" style="display: none;"> With line-of-sight mode deployment and fast response, unmanned aerial vehicle (UAV), equipped with the cutting-edge integrated sensing and communication (ISAC) technique, is poised to deliver high-quality communication and sensing services in maritime emergency scenarios. In practice, however, the real-time transmission of ISAC signals at the UAV side cannot be realized unless the reliable wireless fronthaul link between the terrestrial base station and UAV are available. This paper proposes a multicarrier-division duplex based joint fronthaul-access scheme, where mutually orthogonal subcarrier sets are leveraged to simultaneously support four types of fronthaul/access transmissions. In order to maximize the end-to-end communication rate while maintaining an adequate sensing quality-of-service (QoS) in such a complex scheme, the UAV trajectory, subcarrier assignment and power allocation are jointly optimized. The overall optimization process is designed in two stages. As the emergency area is usually far away from the coast, the optimal initial operating position for the UAV is first found. Once the UAV passes the initial operating position, the UAV's trajectory and resource allocation are optimized during the mission period to maximize the end-to-end communication rate under the constraint of minimum sensing QoS. Simulation results demonstrate the effectiveness of the proposed scheme in dealing with the joint fronthaul-access optimization problem in maritime ISAC networks, offering the advantages over benchmark schemes. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.14027v1-abstract-full').style.display = 'none'; document.getElementById('2408.14027v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.09933">arXiv:2408.09933</a> <span> [<a href="https://arxiv.org/pdf/2408.09933">pdf</a>, <a href="https://arxiv.org/format/2408.09933">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> SZU-AFS Antispoofing System for the ASVspoof 5 Challenge </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Xu%2C+Y">Yuxiong Xu</a>, <a href="/search/eess?searchtype=author&query=Zhong%2C+J">Jiafeng Zhong</a>, <a href="/search/eess?searchtype=author&query=Zheng%2C+S">Sengui Zheng</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+Z">Zefeng Liu</a>, <a href="/search/eess?searchtype=author&query=Li%2C+B">Bin Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.09933v1-abstract-short" style="display: inline;"> This paper presents the SZU-AFS anti-spoofing system, designed for Track 1 of the ASVspoof 5 Challenge under open conditions. The system is built with four stages: selecting a baseline model, exploring effective data augmentation (DA) methods for fine-tuning, applying a co-enhancement strategy based on gradient norm aware minimization (GAM) for secondary fine-tuning, and fusing logits scores from… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.09933v1-abstract-full').style.display = 'inline'; document.getElementById('2408.09933v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.09933v1-abstract-full" style="display: none;"> This paper presents the SZU-AFS anti-spoofing system, designed for Track 1 of the ASVspoof 5 Challenge under open conditions. The system is built with four stages: selecting a baseline model, exploring effective data augmentation (DA) methods for fine-tuning, applying a co-enhancement strategy based on gradient norm aware minimization (GAM) for secondary fine-tuning, and fusing logits scores from the two best-performing fine-tuned models. The system utilizes the Wav2Vec2 front-end feature extractor and the AASIST back-end classifier as the baseline model. During model fine-tuning, three distinct DA policies have been investigated: single-DA, random-DA, and cascade-DA. Moreover, the employed GAM-based co-enhancement strategy, designed to fine-tune the augmented model at both data and optimizer levels, helps the Adam optimizer find flatter minima, thereby boosting model generalization. Overall, the final fusion system achieves a minDCF of 0.115 and an EER of 4.04% on the evaluation set. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.09933v1-abstract-full').style.display = 'none'; document.getElementById('2408.09933v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">8 pages, 2 figures, ASVspoof 5 Workshop (Interspeech2024 Satellite)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.09040">arXiv:2408.09040</a> <span> [<a href="https://arxiv.org/pdf/2408.09040">pdf</a>, <a href="https://arxiv.org/format/2408.09040">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Networking and Internet Architecture">cs.NI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> GLANCE: Graph-based Learnable Digital Twin for Communication Networks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Li%2C+B">Boning Li</a>, <a href="/search/eess?searchtype=author&query=Verma%2C+G">Gunjan Verma</a>, <a href="/search/eess?searchtype=author&query=Efimov%2C+T">Timofey Efimov</a>, <a href="/search/eess?searchtype=author&query=Kumar%2C+A">Abhishek Kumar</a>, <a href="/search/eess?searchtype=author&query=Segarra%2C+S">Santiago Segarra</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.09040v1-abstract-short" style="display: inline;"> As digital twins (DTs) to physical communication systems, network simulators can aid the design and deployment of communication networks. However, time-consuming simulations must be run for every new set of network configurations. Learnable digital twins (LDTs), in contrast, can be trained offline to emulate simulation outcomes and serve as a more efficient alternative to simulation-based DTs at r… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.09040v1-abstract-full').style.display = 'inline'; document.getElementById('2408.09040v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.09040v1-abstract-full" style="display: none;"> As digital twins (DTs) to physical communication systems, network simulators can aid the design and deployment of communication networks. However, time-consuming simulations must be run for every new set of network configurations. Learnable digital twins (LDTs), in contrast, can be trained offline to emulate simulation outcomes and serve as a more efficient alternative to simulation-based DTs at runtime. In this work, we propose GLANCE, a communication LDT that learns from the simulator ns-3. It can evaluate network key performance indicators (KPIs) and assist in network management with exceptional efficiency. Leveraging graph learning, we exploit network data characteristics and devise a specialized architecture to embed sequential and topological features of traffic flows within the network. In addition, multi-task learning (MTL) and transfer learning (TL) are leveraged to enhance GLANCE's generalizability to unseen inputs and efficacy across different tasks. Beyond end-to-end KPI prediction, GLANCE can be deployed within an optimization framework for network management. It serves as an efficient or differentiable evaluator in optimizing network configurations such as traffic loads and flow destinations. Through numerical experiments and benchmarking, we verify the effectiveness of the proposed LDT architecture, demonstrate its robust generalization to various inputs, and showcase its efficacy in network management applications. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.09040v1-abstract-full').style.display = 'none'; document.getElementById('2408.09040v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.21611">arXiv:2407.21611</a> <span> [<a href="https://arxiv.org/pdf/2407.21611">pdf</a>, <a href="https://arxiv.org/format/2407.21611">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Enhancing Partially Spoofed Audio Localization with Boundary-aware Attention Mechanism </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Zhong%2C+J">Jiafeng Zhong</a>, <a href="/search/eess?searchtype=author&query=Li%2C+B">Bin Li</a>, <a href="/search/eess?searchtype=author&query=Yi%2C+J">Jiangyan Yi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.21611v2-abstract-short" style="display: inline;"> The task of partially spoofed audio localization aims to accurately determine audio authenticity at a frame level. Although some works have achieved encouraging results, utilizing boundary information within a single model remains an unexplored research topic. In this work, we propose a novel method called Boundary-aware Attention Mechanism (BAM). Specifically, it consists of two core modules: Bou… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.21611v2-abstract-full').style.display = 'inline'; document.getElementById('2407.21611v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.21611v2-abstract-full" style="display: none;"> The task of partially spoofed audio localization aims to accurately determine audio authenticity at a frame level. Although some works have achieved encouraging results, utilizing boundary information within a single model remains an unexplored research topic. In this work, we propose a novel method called Boundary-aware Attention Mechanism (BAM). Specifically, it consists of two core modules: Boundary Enhancement and Boundary Frame-wise Attention. The former assembles the intra-frame and inter-frame information to extract discriminative boundary features that are subsequently used for boundary position detection and authenticity decision, while the latter leverages boundary prediction results to explicitly control the feature interaction between frames, which achieves effective discrimination between real and fake frames. Experimental results on PartialSpoof database demonstrate our proposed method achieves the best performance. The code is available at https://github.com/media-sec-lab/BAM. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.21611v2-abstract-full').style.display = 'none'; document.getElementById('2407.21611v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 31 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by interspeech 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.17060">arXiv:2407.17060</a> <span> [<a href="https://arxiv.org/pdf/2407.17060">pdf</a>, <a href="https://arxiv.org/format/2407.17060">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> High Efficiency Image Compression for Large Visual-Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Li%2C+B">Binzhe Li</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+S">Shurun Wang</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+S">Shiqi Wang</a>, <a href="/search/eess?searchtype=author&query=Ye%2C+Y">Yan Ye</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.17060v1-abstract-short" style="display: inline;"> In recent years, large visual language models (LVLMs) have shown impressive performance and promising generalization capability in multi-modal tasks, thus replacing humans as receivers of visual information in various application scenarios. In this paper, we pioneer to propose a variable bitrate image compression framework consisting of a pre-editing module and an end-to-end codec to achieve promi… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.17060v1-abstract-full').style.display = 'inline'; document.getElementById('2407.17060v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.17060v1-abstract-full" style="display: none;"> In recent years, large visual language models (LVLMs) have shown impressive performance and promising generalization capability in multi-modal tasks, thus replacing humans as receivers of visual information in various application scenarios. In this paper, we pioneer to propose a variable bitrate image compression framework consisting of a pre-editing module and an end-to-end codec to achieve promising rate-accuracy performance for different LVLMs. In particular, instead of optimizing an adaptive pre-editing network towards a particular task or several representative tasks, we propose a new optimization strategy tailored for LVLMs, which is designed based on the representation and discrimination capability with token-level distortion and rank. The pre-editing module and the variable bitrate end-to-end image codec are jointly trained by the losses based on semantic tokens of the large model, which introduce enhanced generalization capability for various data and tasks. {Experimental results demonstrate that the proposed framework could efficiently achieve much better rate-accuracy performance compared to the state-of-the-art coding standard, Versatile Video Coding.} Meanwhile, experiments with multi-modal tasks have revealed the robustness and generalization capability of the proposed framework. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.17060v1-abstract-full').style.display = 'none'; document.getElementById('2407.17060v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> </li> </ol> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Li%2C+B&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Li%2C+B&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Li%2C+B&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Li%2C+B&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Li%2C+B&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Li%2C+B&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">…</span></li> </ul> </nav> <div class="is-hidden-tablet"> <!-- feedback for mobile only --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a> </span> </div> </div> </main> <footer> <div class="columns is-desktop" role="navigation" aria-label="Secondary"> <!-- MetaColumn 1 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div> <!-- end MetaColumn 1 --> <!-- MetaColumn 2 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div> <!-- end MetaColumn 2 --> </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>