Search | arXiv e-print repository
<a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1–50 of 263 results for author: <span class="mathjax">Zhang, B</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a> </span> </div> </div> <div class="content"> <form method="GET" action="/search/eess" aria-role="search"> Searching in archive <strong>eess</strong>. <a href="/search/?searchtype=author&query=Zhang%2C+B">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." 
type="text" value="Zhang, B"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Zhang%2C+B&terms-0-field=author&size=50&order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Zhang, B"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. 
</div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Zhang%2C+B&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Zhang%2C+B&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Zhang%2C+B&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Zhang%2C+B&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Zhang%2C+B&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Zhang%2C+B&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li> <a href="/search/?searchtype=author&query=Zhang%2C+B&start=250" class="pagination-link " aria-label="Page 6" aria-current="page">6 </a> </li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.12962">arXiv:2411.12962</a> <span> [<a href="https://arxiv.org/pdf/2411.12962">pdf</a>, <a href="https://arxiv.org/format/2411.12962">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> Bring the Heat: Rapid Trajectory Optimization with Pseudospectral Techniques and the Affine Geometric Heat Flow Equation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Adu%2C+C+E">Challen Enninful Adu</a>, <a href="/search/eess?searchtype=author&query=Chuquiure%2C+C+E+R">C茅sar E. Ramos Chuquiure</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+B">Bohao Zhang</a>, <a href="/search/eess?searchtype=author&query=Vasudevan%2C+R">Ram Vasudevan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.12962v2-abstract-short" style="display: inline;"> Generating optimal trajectories for high-dimensional robotic systems in a time-efficient manner while adhering to constraints is a challenging task. This paper introduces PHLAME, which applies pseudospectral collocation and spatial vector algebra to efficiently solve the Affine Geometric Heat Flow (AGHF) Partial Differential Equation (PDE) for trajectory optimization. 
Unlike traditional PDE approaches like the Hamilton-Jacobi-Bellman (HJB) PDE, which solve for a function over the entire state space, computing a solution to the AGHF PDE scales more efficiently because its solution is defined over a two-dimensional domain, thereby avoiding the intractability of state-space scaling. To solve the AGHF, one usually applies the Method of Lines (MOL), which discretizes one variable of the AGHF PDE and converts the PDE into a system of ordinary differential equations (ODEs) that are solved using standard time-integration methods. Though powerful, this method requires a fine discretization to generate accurate solutions and requires evaluating the AGHF PDE, which is computationally expensive for high-dimensional systems. PHLAME overcomes this deficiency by using a pseudospectral method, which reduces the number of function evaluations required to yield a high-accuracy solution, thereby allowing it to scale efficiently to high-dimensional robotic systems. To further increase computational speed, this paper presents analytical expressions for the AGHF and its Jacobian, both of which can be computed efficiently using rigid body dynamics algorithms. PHLAME is tested across various dynamical systems, with and without obstacles, and compared to a number of state-of-the-art techniques. PHLAME generates trajectories for a 44-dimensional state-space system in $\sim 5$ seconds, much faster than current state-of-the-art techniques. A project page is available at https://roahmlab.github.io/PHLAME/

Submitted 24 November, 2024; v1 submitted 19 November, 2024; originally announced November 2024.

Comments: 26 pages, 8 figures. A project page can be found at https://roahmlab.github.io/PHLAME/
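The pseudospectral ingredient can be made concrete: on Chebyshev-Gauss-Lobatto nodes, a dense differentiation matrix replaces the fine grid of MOL-style finite differences and differentiates smooth functions with spectral accuracy from very few nodes. Below is a minimal sketch of that building block, following the standard construction in Trefethen's Spectral Methods in MATLAB; it is not PHLAME's implementation.

```python
import numpy as np

def cheb(N):
    """Chebyshev differentiation matrix on N+1 Gauss-Lobatto nodes."""
    if N == 0:
        return np.zeros((1, 1)), np.array([1.0])
    x = np.cos(np.pi * np.arange(N + 1) / N)           # nodes in [-1, 1]
    c = np.hstack([2.0, np.ones(N - 1), 2.0]) * (-1.0) ** np.arange(N + 1)
    X = np.tile(x, (N + 1, 1)).T
    dX = X - X.T
    D = np.outer(c, 1.0 / c) / (dX + np.eye(N + 1))    # off-diagonal entries
    D -= np.diag(D.sum(axis=1))                        # diagonal via row sums
    return D, x

# Spectral accuracy: differentiate exp(x) with only 17 nodes.
D, x = cheb(16)
err = np.max(np.abs(D @ np.exp(x) - np.exp(x)))
print(f"max error with 17 nodes: {err:.2e}")  # ~1e-13, vs ~1e-3 for 2nd-order FD
```

Replacing a finite-difference stencil with `D` is what cuts the number of function evaluations the abstract refers to: far fewer collocation points are needed for the same accuracy.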
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">26 pages, 8 figures, A project page can be found at https://roahmlab.github.io/PHLAME/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.13223">arXiv:2410.13223</a> <span> [<a href="https://arxiv.org/pdf/2410.13223">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> Coordinated Dispatch of Energy Storage Systems in the Active Distribution Network: A Complementary Reinforcement Learning and Optimization Approach </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Zhang%2C+B">Bohan Zhang</a>, <a href="/search/eess?searchtype=author&query=Yi%2C+Z">Zhongkai Yi</a>, <a href="/search/eess?searchtype=author&query=Xu%2C+Y">Ying Xu</a>, <a href="/search/eess?searchtype=author&query=Tu%2C+Z">Zhenghong Tu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.13223v1-abstract-short" style="display: inline;"> The complexity and nonlinearity of active distribution network (ADN), coupled with the fast-changing renewable energy (RE), necessitate advanced real-time and safe dispatch approach. This paper proposes a complementary reinforcement learning (RL) and optimization approach, namely SA2CO, to address the coordinated dispatch of the energy storage systems (ESSs) in the ADN. The proposed approach lever… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.13223v1-abstract-full').style.display = 'inline'; document.getElementById('2410.13223v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.13223v1-abstract-full" style="display: none;"> The complexity and nonlinearity of active distribution network (ADN), coupled with the fast-changing renewable energy (RE), necessitate advanced real-time and safe dispatch approach. This paper proposes a complementary reinforcement learning (RL) and optimization approach, namely SA2CO, to address the coordinated dispatch of the energy storage systems (ESSs) in the ADN. The proposed approach leverages RL's capability to make fast decision and address the model inaccuracies, while optimization methods ensure the ADN security. Furthermore, a hybrid data-driven and expert-experience auxiliary neural network is formulated as a rapid security assessment component in the SA2CO algorithm, enabling dynamic switching between RL and optimization methodologies. Simulation results demonstrate the proposed method's effectiveness and scalability in achieving real-time, safe, and economical dispatch of multiple ESSs in the ADN, surpassing the performance of the state-of-the-art RL and optimization methods. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.13223v1-abstract-full').style.display = 'none'; document.getElementById('2410.13223v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.13099">arXiv:2410.13099</a> <span> [<a href="https://arxiv.org/pdf/2410.13099">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Adversarial Neural Networks in Medical Imaging Advancements and Challenges in Semantic Segmentation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Liu%2C+H">Houze Liu</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+B">Bo Zhang</a>, <a href="/search/eess?searchtype=author&query=Xiang%2C+Y">Yanlin Xiang</a>, <a href="/search/eess?searchtype=author&query=Hu%2C+Y">Yuxiang Hu</a>, <a href="/search/eess?searchtype=author&query=Shen%2C+A">Aoran Shen</a>, <a href="/search/eess?searchtype=author&query=Lin%2C+Y">Yang Lin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.13099v1-abstract-short" style="display: inline;"> Recent advancements in artificial intelligence (AI) have precipitated a paradigm shift in medical imaging, particularly revolutionizing the domain of brain imaging. This paper systematically investigates the integration of deep learning -- a principal branch of AI -- into the semantic segmentation of brain images. Semantic segmentation serves as an indispensable technique for the delineation of di… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.13099v1-abstract-full').style.display = 'inline'; document.getElementById('2410.13099v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.13099v1-abstract-full" style="display: none;"> Recent advancements in artificial intelligence (AI) have precipitated a paradigm shift in medical imaging, particularly revolutionizing the domain of brain imaging. This paper systematically investigates the integration of deep learning -- a principal branch of AI -- into the semantic segmentation of brain images. Semantic segmentation serves as an indispensable technique for the delineation of discrete anatomical structures and the identification of pathological markers, essential for the diagnosis of complex neurological disorders. Historically, the reliance on manual interpretation by radiologists, while noteworthy for its accuracy, is plagued by inherent subjectivity and inter-observer variability. This limitation becomes more pronounced with the exponential increase in imaging data, which traditional methods struggle to process efficiently and effectively. 
In response to these challenges, this study introduces the application of adversarial neural networks, a novel AI approach that not only automates but also refines the semantic segmentation process. By leveraging these advanced neural networks, our approach enhances the precision of diagnostic outputs, reducing human error and increasing the throughput of imaging data analysis. The paper provides a detailed discussion of how adversarial neural networks facilitate a more robust, objective, and scalable solution, thereby significantly improving diagnostic accuracy in neurological evaluations. This exploration highlights the transformative impact of AI on medical imaging, setting a new benchmark for future research and clinical practice in neurology.

Submitted 16 October, 2024; originally announced October 2024.

4. arXiv:2410.11296 [pdf, other] - cs.GT (Computer Science and Game Theory); eess.SY (Systems and Control)

Strategic and Fair Aggregator Interactions in Energy Markets: Multi-agent Dynamics and Quasiconcave Games

Authors: Jiayi Li, Matt Motoki, Baosen Zhang

Abstract: The introduction of aggregator structures has proven effective in bringing fairness to energy resource allocation by negotiating for more resources and economic surplus on behalf of users. This paper extends the fair energy resource allocation problem to a multi-agent setting, focusing on interactions among multiple aggregators in an electricity market.
We prove that the strategic optimization problems faced by the aggregators form a quasiconcave game, ensuring the existence of a Nash equilibrium. This resolves complexities arising from the dependence of market prices on total purchases and from balancing fairness and efficiency in energy allocation. In addition, we design simulations to characterize the equilibrium points of the induced game, demonstrating how aggregators stabilize market outcomes, ensure fair resource distribution, and optimize user surplus. Our findings offer a robust framework for understanding strategic interactions among aggregators, contributing to more efficient and equitable energy markets.

Submitted 15 October, 2024; originally announced October 2024.
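A common way to characterize equilibria of such a game numerically is best-response iteration: each aggregator maximizes its own quasiconcave surplus holding the others' purchases fixed, cycling until nothing changes. Here is a toy two-aggregator sketch with an assumed logarithmic utility and linear price curve; these are illustrative assumptions, not the paper's market model.

```python
import numpy as np
from scipy.optimize import minimize_scalar

def price(total):
    """Toy inverse supply curve (assumption): price rises in total purchases."""
    return 1.0 + 0.5 * total

def surplus(q_i, q_other, value=10.0):
    """Aggregator i's surplus; concave (hence quasiconcave) in q_i."""
    return value * np.log(1.0 + q_i) - price(q_i + q_other) * q_i

q = np.array([1.0, 1.0])          # initial purchase quantities
for _ in range(100):              # best-response dynamics
    for i in range(2):
        res = minimize_scalar(lambda qi: -surplus(qi, q[1 - i]),
                              bounds=(0.0, 20.0), method="bounded")
        q[i] = res.x
print("approximate Nash equilibrium:", q)
```

When best-response dynamics converge, the fixed point is by construction a Nash equilibrium of the game; quasiconcavity is what guarantees such a point exists at all.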
5. arXiv:2410.00796 [pdf, other] - eess.SY (Systems and Control); cs.LG (Machine Learning); math.OC (Optimization and Control)

Fast and Reliable $N-k$ Contingency Screening with Input-Convex Neural Networks

Authors: Nicolas Christianson, Wenqi Cui, Steven Low, Weiwei Yang, Baosen Zhang

Abstract: Power system operators must ensure that dispatch decisions remain feasible in case of grid outages or contingencies to prevent cascading failures and ensure reliable operation. However, checking the feasibility of all $N - k$ contingencies -- every possible simultaneous failure of $k$ grid components -- is computationally intractable for even small $k$, requiring system operators to resort to heuristic screening methods. Because of the increase in uncertainty and changes in system behaviors, heuristic lists might not include all relevant contingencies, generating false negatives in which unsafe scenarios are misclassified as safe. In this work, we propose to use input-convex neural networks (ICNNs) for contingency screening. We show that ICNN reliability can be determined by solving a convex optimization problem, and by scaling model weights using this problem as a differentiable optimization layer during training, we can learn an ICNN classifier that is both data-driven and has provably guaranteed reliability. Namely, our method can ensure a zero false negative rate. We empirically validate this methodology in a case study on the IEEE 39-bus test network, observing that it yields substantial (10-20x) speedups while having excellent classification accuracy.

Submitted 1 October, 2024; originally announced October 2024.

Comments: 11 pages, 4 figures
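Input convexity is a structural property: weights acting on previous hidden layers are kept nonnegative while passthrough weights on the input stay unconstrained, and activations are convex and nondecreasing, so the output is convex in the input. That convexity is what turns the reliability check into a convex optimization problem. A generic forward-pass sketch in the style of Amos et al.'s ICNN follows; it is not the paper's screening model.

```python
import numpy as np

def relu(z):
    return np.maximum(z, 0.0)

def icnn_forward(x, Ws, Us, bs):
    """Generic input-convex network:
    z_{k+1} = relu(W_k z_k + U_k x + b_k) with W_k >= 0 elementwise,
    so each output coordinate is convex in x."""
    z = relu(Us[0] @ x + bs[0])              # first layer has no W term
    for W, U, b in zip(Ws, Us[1:], bs[1:]):
        z = relu(np.abs(W) @ z + U @ x + b)  # abs() enforces nonnegative W
    return z

rng = np.random.default_rng(0)
x = rng.normal(size=3)
Ws = [rng.normal(size=(4, 4))]
Us = [rng.normal(size=(4, 3)), rng.normal(size=(4, 3))]
bs = [rng.normal(size=4), rng.normal(size=4)]
print(icnn_forward(x, Ws, Us, bs))
```

Because a convex function's maximum over a polytope is attained at a vertex and its sublevel sets are convex, certifying "no unsafe input is classified safe" becomes tractable, which is the lever behind the zero-false-negative guarantee.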
6. arXiv:2409.19716 [pdf, other] - cs.LG (Machine Learning); cs.AI (Artificial Intelligence); eess.SY (Systems and Control)

Constrained Reinforcement Learning for Safe Heat Pump Control

Authors: Baohe Zhang, Lilli Frison, Thomas Brox, Joschka Bödecker

Abstract: Constrained Reinforcement Learning (RL) has emerged as a significant research area within RL, where integrating constraints with rewards is crucial for enhancing safety and performance across diverse control tasks. In the context of heating systems in buildings, optimizing energy efficiency while maintaining residents' thermal comfort can be intuitively formulated as a constrained optimization problem. However, solving it with RL may require a large amount of data, so an accurate and versatile simulator is favored. In this paper, we propose a novel building simulator I4B, which provides interfaces for different usages, and apply a model-free constrained RL algorithm named constrained Soft Actor-Critic with Linear Smoothed Log Barrier function (CSAC-LB) to the heating optimization problem. Benchmarking against baseline algorithms demonstrates CSAC-LB's efficiency in data exploration, constraint satisfaction, and performance.

Submitted 29 September, 2024; originally announced September 2024.
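A linearly smoothed log barrier is a log-barrier penalty extended linearly once a constraint is violated, so gradients stay finite everywhere. Below is a sketch of one such C1 extension, following the common log-barrier extension; the paper's exact form may differ.

```python
import numpy as np

def smoothed_log_barrier(z, t=2.0):
    """Barrier -log(-z)/t for constraint z <= 0, continued linearly past
    z = -1/t**2 so the penalty and its gradient stay finite for violated
    constraints (a common log-barrier extension; assumption, not the
    paper's exact formula)."""
    z = np.asarray(z, dtype=float)
    switch = -1.0 / t**2
    barrier = -np.log(-np.minimum(z, switch)) / t      # clamped: log input > 0
    linear = t * z - np.log(1.0 / t**2) / t + 1.0 / t  # matches value & slope
    return np.where(z <= switch, barrier, linear)

print(smoothed_log_barrier(np.array([-1.0, -0.25, 0.0, 0.5])))
```

At the switch point the two branches agree in both value and derivative (slope $t$), which is what lets an off-policy learner like SAC explore through constraint violations without the penalty blowing up.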
7. arXiv:2409.13890 [pdf, ps, other] - eess.SY (Systems and Control)

Safe Control of Grid-Interfacing Inverters with Current Magnitude Limits

Authors: Trager Joswig-Jones, Baosen Zhang

Abstract: Grid-interfacing inverters allow renewable resources to be connected to the electric grid and offer fast and programmable control responses. However, inverters are subject to significant physical constraints. One such constraint is a current magnitude limit required to protect semiconductor devices. While many current limiting methods are available, they can often unpredictably alter the behavior of the inverter control during overcurrent events, leading to instability or poor performance. In this paper, we present a safety filter approach to limit the current magnitude of inverters controlled as voltage sources. The safety filter problem is formulated with a control barrier function constraint that encodes the current magnitude limit. To ensure feasibility of the problem, we prove the existence of a safe linear controller for a specified reference. This approach allows the desired voltage source behavior to be minimally altered while safely limiting the current output.

Submitted 20 September, 2024; originally announced September 2024.

Comments: 10 pages, 6 figures, Submitted to HICSS'58
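A safety filter of this kind solves, at each step, the quadratic program "stay as close as possible to the nominal input subject to a control barrier function (CBF) constraint." With a single affine constraint the QP has a closed-form solution: a projection onto a half-space. The sketch below is a generic control-affine version, not the paper's inverter model or notation.

```python
import numpy as np

def cbf_safety_filter(u_nom, grad_h_g, lie_f_h, alpha_h):
    """min ||u - u_nom||^2  s.t.  Lf_h + (Lg_h) . u + alpha(h) >= 0.
    One affine CBF constraint => closed-form half-space projection."""
    slack = lie_f_h + grad_h_g @ u_nom + alpha_h
    if slack >= 0.0:                  # nominal input already safe: pass through
        return u_nom
    # otherwise project onto the constraint boundary
    return u_nom - slack * grad_h_g / (grad_h_g @ grad_h_g)

u = cbf_safety_filter(u_nom=np.array([1.0, 0.5]),
                      grad_h_g=np.array([0.0, -2.0]),  # Lg h(x)
                      lie_f_h=0.3,                      # Lf h(x)
                      alpha_h=0.1)                      # alpha(h(x))
print(u)  # [1.0, 0.2]: minimally altered to sit on the safe boundary
```

The "minimally altered" language in the abstract is exactly this behavior: the filter is the identity whenever the nominal voltage-source control already respects the current limit.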
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">10 pages, 6 figures, Submitted to HICSS'58</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.08605">arXiv:2409.08605</a> <span> [<a href="https://arxiv.org/pdf/2409.08605">pdf</a>, <a href="https://arxiv.org/format/2409.08605">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> Effective Integration of KAN for Keyword Spotting </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Xu%2C+A">Anfeng Xu</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+B">Biqiao Zhang</a>, <a href="/search/eess?searchtype=author&query=Kong%2C+S">Shuyu Kong</a>, <a href="/search/eess?searchtype=author&query=Huang%2C+Y">Yiteng Huang</a>, <a href="/search/eess?searchtype=author&query=Yang%2C+Z">Zhaojun Yang</a>, <a href="/search/eess?searchtype=author&query=Srivastava%2C+S">Sangeeta Srivastava</a>, <a href="/search/eess?searchtype=author&query=Sun%2C+M">Ming Sun</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.08605v1-abstract-short" style="display: inline;"> Keyword spotting (KWS) is an important speech processing component for smart devices with voice assistance capability. In this paper, we investigate if Kolmogorov-Arnold Networks (KAN) can be used to enhance the performance of KWS. We explore various approaches to integrate KAN for a model architecture based on 1D Convolutional Neural Networks (CNN). We find that KAN is effective at modeling high-… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.08605v1-abstract-full').style.display = 'inline'; document.getElementById('2409.08605v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.08605v1-abstract-full" style="display: none;"> Keyword spotting (KWS) is an important speech processing component for smart devices with voice assistance capability. In this paper, we investigate if Kolmogorov-Arnold Networks (KAN) can be used to enhance the performance of KWS. We explore various approaches to integrate KAN for a model architecture based on 1D Convolutional Neural Networks (CNN). We find that KAN is effective at modeling high-level features in lower-dimensional spaces, resulting in improved KWS performance when integrated appropriately. The findings shed light on understanding KAN for speech processing tasks and on other modalities for future researchers. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.08605v1-abstract-full').style.display = 'none'; document.getElementById('2409.08605v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Under review</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.05430">arXiv:2409.05430</a> <span> [<a href="https://arxiv.org/pdf/2409.05430">pdf</a>, <a href="https://arxiv.org/format/2409.05430">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> Findings of the 2024 Mandarin Stuttering Event Detection and Automatic Speech Recognition Challenge </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Xue%2C+H">Hongfei Xue</a>, <a href="/search/eess?searchtype=author&query=Gong%2C+R">Rong Gong</a>, <a href="/search/eess?searchtype=author&query=Shao%2C+M">Mingchen Shao</a>, <a href="/search/eess?searchtype=author&query=Xu%2C+X">Xin Xu</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+L">Lezhi Wang</a>, <a href="/search/eess?searchtype=author&query=Xie%2C+L">Lei Xie</a>, <a href="/search/eess?searchtype=author&query=Bu%2C+H">Hui Bu</a>, <a href="/search/eess?searchtype=author&query=Zhou%2C+J">Jiaming Zhou</a>, <a href="/search/eess?searchtype=author&query=Qin%2C+Y">Yong Qin</a>, <a href="/search/eess?searchtype=author&query=Du%2C+J">Jun Du</a>, <a href="/search/eess?searchtype=author&query=Li%2C+M">Ming Li</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+B">Binbin Zhang</a>, <a href="/search/eess?searchtype=author&query=Jia%2C+B">Bin Jia</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.05430v1-abstract-short" style="display: inline;"> The StutteringSpeech Challenge focuses on advancing speech technologies for people who stutter, specifically targeting Stuttering Event Detection (SED) and Automatic Speech Recognition (ASR) in Mandarin. The challenge comprises three tracks: (1) SED, which aims to develop systems for detection of stuttering events; (2) ASR, which focuses on creating robust systems for recognizing stuttered speech;… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.05430v1-abstract-full').style.display = 'inline'; document.getElementById('2409.05430v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.05430v1-abstract-full" style="display: none;"> The StutteringSpeech Challenge focuses on advancing speech technologies for people who stutter, specifically targeting Stuttering Event Detection (SED) and Automatic Speech Recognition (ASR) in Mandarin. The challenge comprises three tracks: (1) SED, which aims to develop systems for detection of stuttering events; (2) ASR, which focuses on creating robust systems for recognizing stuttered speech; and (3) Research track for innovative approaches utilizing the provided dataset. We utilizes an open-source Mandarin stuttering dataset AS-70, which has been split into new training and test sets for the challenge. 
This paper presents the dataset, details the challenge tracks, and analyzes the performance of the top systems, highlighting improvements in detection accuracy and reductions in recognition error rates. Our findings underscore the potential of specialized models and augmentation strategies in developing stuttered speech technologies.

Submitted 9 September, 2024; originally announced September 2024.

Comments: 8 pages, 2 figures, accepted by SLT 2024

10. arXiv:2409.00099 [pdf, other] - cs.CL (Computation and Language); cs.AI (Artificial Intelligence); cs.SD (Sound); eess.AS (Audio and Speech Processing)

Query-by-Example Keyword Spotting Using Spectral-Temporal Graph Attentive Pooling and Multi-Task Learning

Authors: Zhenyu Wang, Shuyu Kong, Li Wan, Biqiao Zhang, Yiteng Huang, Mumin Jin, Ming Sun, Xin Lei, Zhaojun Yang

Abstract: Existing keyword spotting (KWS) systems primarily rely on predefined keyword phrases. However, the ability to recognize customized keywords is crucial for tailoring interactions with intelligent devices. In this paper, we present a novel Query-by-Example (QbyE) KWS system that employs spectral-temporal graph attentive pooling and multi-task learning.
This framework aims to effectively learn speaker-invariant and linguistic-informative embeddings for QbyE KWS tasks. Within this framework, we investigate three distinct network architectures for encoder modeling: LiCoNet, Conformer, and ECAPA_TDNN. The experimental results on a substantial internal dataset of 629 speakers have demonstrated the effectiveness of the proposed QbyE framework in maximizing the potential of simpler models such as LiCoNet. In particular, LiCoNet, which is 13x more efficient, achieves comparable performance to the computationally intensive Conformer model (1.98% vs. 1.63% FRR at 0.3 FAs/hr).

Submitted 23 November, 2024; v1 submitted 26 August, 2024; originally announced September 2024.

Journal ref: INTERSPEECH 2024
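Attentive pooling, the core of such an embedding extractor, collapses a variable-length sequence of frame embeddings into a single utterance vector by softmax-weighting frames against a learned query. A simplified single-head sketch is below; the paper's spectral-temporal graph variant is considerably more elaborate.

```python
import numpy as np

def attentive_pooling(H, w):
    """H: (T, d) frame embeddings; w: (d,) learned query vector.
    Softmax over per-frame scores gives attention weights; the output is
    the weighted sum, a fixed-size utterance embedding (a simplified
    stand-in for spectral-temporal graph attentive pooling)."""
    scores = H @ w                            # (T,) frame relevance scores
    weights = np.exp(scores - scores.max())   # numerically stable softmax
    weights /= weights.sum()
    return weights @ H                        # (d,) utterance embedding

rng = np.random.default_rng(0)
H = rng.normal(size=(50, 16))                 # 50 frames, 16-dim embeddings
w = rng.normal(size=16)
print(attentive_pooling(H, w).shape)          # (16,)
```

For query-by-example matching, both the enrolled example and the incoming audio are pooled this way and compared in the shared embedding space.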
11. arXiv:2408.14759 [pdf, ps, other] - eess.SY (Systems and Control); math.OC (Optimization and Control)

Model Predictive Control for T-S Fuzzy Markovian Jump Systems Using Dynamic Prediction Optimization

Authors: Bin Zhang

Abstract: In this paper, the model predictive control (MPC) problem is investigated for constrained discrete-time Takagi-Sugeno fuzzy Markovian jump systems (FMJSs) under imperfect premise matching rules. To strike a balance between the initial feasible region, control performance, and online computational burden, a set of mode-dependent state-feedback fuzzy controllers within the frame of dynamic prediction optimization (DPO)-MPC is delicately designed with perturbation variables produced by the predictive dynamics. The DPO-MPC controllers are implemented in two stages: at the first stage, terminal constraint sets accompanied by feedback gains are obtained by solving a "min-max" problem; at the second stage, a set of perturbations is designed to enlarge the feasible region. Here, dynamic feedback gains are designed off-line using a matrix factorization technique, while the dynamic controller state is determined online over a moving horizon to gradually guide the system state from the initial feasible region to the terminal constraint set. Sufficient conditions are provided to rigorously ensure the recursive feasibility of the proposed DPO-MPC scheme and the mean-square stability of the underlying FMJS. Finally, the efficacy of the proposed methods is demonstrated through a robot arm system example.

Submitted 26 August, 2024; originally announced August 2024.
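The "perturbation variables produced by the predictive dynamics" follow the familiar two-degree-of-freedom structure of perturbation-based MPC, roughly (a schematic reading, assuming the standard construction; the paper's mode-dependent fuzzy version is richer):

$$u_k \;=\; K_{r_k}\, x_k \;+\; c_k, \qquad k = 0, \dots, N_c - 1, \qquad c_k = 0 \ \text{for } k \ge N_c,$$

where the mode-dependent gains $K_{r_k}$ come from the off-line min-max stage, and the perturbation sequence $\{c_k\}$ is the online decision variable that enlarges the feasible region and steers the state into the terminal constraint set, after which the pure feedback law takes over.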
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.13355">arXiv:2408.13355</a> <span> [<a href="https://arxiv.org/pdf/2408.13355">pdf</a>, <a href="https://arxiv.org/format/2408.13355">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Disentangled Training with Adversarial Examples For Robust Small-footprint Keyword Spotting </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Wang%2C+Z">Zhenyu Wang</a>, <a href="/search/eess?searchtype=author&query=Wan%2C+L">Li Wan</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+B">Biqiao Zhang</a>, <a href="/search/eess?searchtype=author&query=Huang%2C+Y">Yiteng Huang</a>, <a href="/search/eess?searchtype=author&query=Li%2C+S">Shang-Wen Li</a>, <a href="/search/eess?searchtype=author&query=Sun%2C+M">Ming Sun</a>, <a href="/search/eess?searchtype=author&query=Lei%2C+X">Xin Lei</a>, <a href="/search/eess?searchtype=author&query=Yang%2C+Z">Zhaojun Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.13355v1-abstract-short" style="display: inline;"> A keyword spotting (KWS) engine that is continuously running on device is exposed to various speech signals that are usually unseen before. It is a challenging problem to build a small-footprint and high-performing KWS model with robustness under different acoustic environments. In this paper, we explore how to effectively apply adversarial examples to improve KWS robustness. We propose datasource… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.13355v1-abstract-full').style.display = 'inline'; document.getElementById('2408.13355v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.13355v1-abstract-full" style="display: none;"> A keyword spotting (KWS) engine that is continuously running on device is exposed to various speech signals that are usually unseen before. It is a challenging problem to build a small-footprint and high-performing KWS model with robustness under different acoustic environments. In this paper, we explore how to effectively apply adversarial examples to improve KWS robustness. We propose datasource-aware disentangled learning with adversarial examples to reduce the mismatch between the original and adversarial data as well as the mismatch across original training datasources. The KWS model architecture is based on depth-wise separable convolution and a simple attention module. Experimental results demonstrate that the proposed learning strategy improves false reject rate by $40.31%$ at $1%$ false accept rate on the internal dataset, compared to the strongest baseline without using adversarial examples. Our best-performing system achieves $98.06%$ accuracy on the Google Speech Commands V1 dataset. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.13355v1-abstract-full').style.display = 'none'; document.getElementById('2408.13355v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> ICASSP 2023 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.10670">arXiv:2408.10670</a> <span> [<a href="https://arxiv.org/pdf/2408.10670">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> A Noncontact Technique for Wave Measurement Based on Thermal Stereography and Deep Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Li%2C+D">Deyu Li</a>, <a href="/search/eess?searchtype=author&query=Xiao%2C+L">Longfei Xiao</a>, <a href="/search/eess?searchtype=author&query=Wei%2C+H">Handi Wei</a>, <a href="/search/eess?searchtype=author&query=Li%2C+Y">Yan Li</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+B">Binghua Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.10670v1-abstract-short" style="display: inline;"> The accurate measurement of the wave field and its spatiotemporal evolution is essential in many hydrodynamic experiments and engineering applications. The binocular stereo imaging technique has been widely used to measure waves. However, the optical properties of indoor water surfaces, including transparency, specular reflection, and texture absence, pose challenges for image processing and stere… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.10670v1-abstract-full').style.display = 'inline'; document.getElementById('2408.10670v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.10670v1-abstract-full" style="display: none;"> The accurate measurement of the wave field and its spatiotemporal evolution is essential in many hydrodynamic experiments and engineering applications. The binocular stereo imaging technique has been widely used to measure waves. However, the optical properties of indoor water surfaces, including transparency, specular reflection, and texture absence, pose challenges for image processing and stereo reconstruction. This study proposed a novel technique that combined thermal stereography and deep learning to achieve fully noncontact wave measurements. The optical imaging properties of water in the long-wave infrared spectrum were found to be suitable for stereo matching, effectively avoiding the issues in the visible-light spectrum. After capturing wave images using thermal stereo cameras, a reconstruction strategy involving deep learning techniques was proposed to improve stereo matching performance. 
A generative approach was employed to synthesize a dataset with ground-truth disparity from unannotated infrared images. This dataset was then fed to a pretrained stereo neural network for fine-tuning to achieve domain adaptation. Wave flume experiments were conducted to validate the feasibility and accuracy of the proposed technique. The final reconstruction results showed good agreement with the measurements obtained using wave probes, with a mean bias of less than 2.1%, suggesting that the novel technique effectively measures the spatiotemporal distribution of the wave surface in hydrodynamic experiments. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p>
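<p class="is-size-7">The domain-adaptation step, fine-tuning a pretrained stereo network on generated (infrared pair, disparity) samples, might look like the sketch below; the loader interface and loss choice are assumptions, not details from the paper.</p>
<pre><code class="language-python">
# Fine-tuning a pretrained stereo network on synthesized (IR pair, disparity)
# samples for domain adaptation; loader and loss choice are assumptions.
import torch
import torch.nn.functional as F

def finetune(stereo_net, loader, epochs=10, lr=1e-5):
    opt = torch.optim.Adam(stereo_net.parameters(), lr=lr)
    for _ in range(epochs):
        for left_ir, right_ir, gt_disp in loader:    # generated ground-truth disparity
            pred = stereo_net(left_ir, right_ir)
            loss = F.smooth_l1_loss(pred, gt_disp)   # robust disparity regression
            opt.zero_grad(); loss.backward(); opt.step()
    return stereo_net
</code></pre>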
</li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.04325">arXiv:2408.04325</a> <span> [<a href="https://arxiv.org/pdf/2408.04325">pdf</a>, <a href="https://arxiv.org/format/2408.04325">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> HydraFormer: One Encoder For All Subsampling Rates </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Xu%2C+Y">Yaoxun Xu</a>, <a href="/search/eess?searchtype=author&query=Song%2C+X">Xingchen Song</a>, <a href="/search/eess?searchtype=author&query=Wu%2C+Z">Zhiyong Wu</a>, <a href="/search/eess?searchtype=author&query=Wu%2C+D">Di Wu</a>, <a href="/search/eess?searchtype=author&query=Peng%2C+Z">Zhendong Peng</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+B">Binbin Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2408.04325v1-abstract-full"> In automatic speech recognition, subsampling is essential for tackling diverse scenarios. However, the inadequacy of a single subsampling rate for various real-world situations often necessitates training and deploying multiple models, consequently increasing the associated costs. To address this issue, we propose HydraFormer, comprising HydraSub, a Conformer-based encoder, and a BiTransformer-based decoder. HydraSub encompasses multiple branches, each representing a distinct subsampling rate, allowing for the flexible selection of any branch during inference based on the specific use case. HydraFormer can efficiently manage different subsampling rates, significantly reducing training and deployment expenses. Experiments on the AISHELL-1 and LibriSpeech datasets reveal that HydraFormer effectively adapts to various subsampling rates and languages while maintaining high recognition performance. Additionally, HydraFormer exhibits exceptional stability, sustaining consistent performance under various initialization conditions, and shows robust transferability by learning from pretrained single-subsampling-rate automatic speech recognition models (model code and scripts: https://github.com/HydraFormer/hydraformer). </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">accepted by ICME 2024</span> </p>
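<p class="is-size-7">A minimal sketch of the HydraSub idea follows: several subsampling front-ends share one encoder, and a branch is selected per use case at inference time. Module names and the single-convolution subsampling are illustrative simplifications; the official code is linked in the abstract.</p>
<pre><code class="language-python">
# Sketch of the HydraSub idea: one branch per subsampling rate, selected at
# inference; real front-ends are stacks of 2-D convolutions, not one Conv1d.
import torch.nn as nn

class HydraSub(nn.Module):
    def __init__(self, feat_dim=80, d_model=256, rates=(4, 6, 8)):
        super().__init__()
        self.branches = nn.ModuleDict({
            str(r): nn.Conv1d(feat_dim, d_model, kernel_size=r, stride=r)
            for r in rates})
    def forward(self, feats, rate):              # feats: (batch, feat_dim, time)
        return self.branches[str(rate)](feats)   # pick the branch for this use case

class HydraFormerSketch(nn.Module):
    def __init__(self, encoder, decoder, **kw):  # shared Conformer / BiTransformer
        super().__init__()
        self.sub, self.encoder, self.decoder = HydraSub(**kw), encoder, decoder
    def forward(self, feats, rate):
        return self.decoder(self.encoder(self.sub(feats, rate)))
</code></pre>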
</li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.03131">arXiv:2408.03131</a> <span> [<a href="https://arxiv.org/pdf/2408.03131">pdf</a>, <a href="https://arxiv.org/format/2408.03131">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> Stochastic Trajectory Optimization for Demonstration Imitation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Ming%2C+C">Chenlin Ming</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+Z">Zitong Wang</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+B">Boxuan Zhang</a>, <a href="/search/eess?searchtype=author&query=Duan%2C+X">Xiaoming Duan</a>, <a href="/search/eess?searchtype=author&query=He%2C+J">Jianping He</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2408.03131v2-abstract-full"> Humans often learn new skills by imitating experts and gradually developing their proficiency. In this work, we introduce Stochastic Trajectory Optimization for Demonstration Imitation (STODI), a trajectory optimization framework for robots to imitate the shape of demonstration trajectories with improved dynamic performance. Consistent with the human learning process, demonstration imitation serves as an initial step, while trajectory optimization aims to enhance robot motion performance. By generating random noise and constructing proper cost functions, STODI effectively explores and exploits the generated noisy trajectories while preserving the shape characteristics of the demonstration. We employ three metrics to measure the similarity of trajectories in both the time and frequency domains to guide demonstration imitation. Theoretical analysis reveals relationships among these metrics, emphasizing the benefits of frequency-domain analysis for specific tasks. Experiments on a 7-DOF robotic arm in the PyBullet simulator validate the efficacy of the STODI framework, showcasing improved optimization performance and stability compared to previous methods. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 6 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p>
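<p class="is-size-7">The explore-exploit loop can be pictured with a STOMP-style weighted-noise update combined with time- and frequency-domain similarity costs, as in the sketch below; the cost weights and mixing are placeholders rather than the paper's exact metrics.</p>
<pre><code class="language-python">
# STOMP-style update illustrating STODI's explore/exploit loop: sample noisy
# trajectories, score their similarity to the demonstration in the time and
# frequency domains, and take a softmax-weighted step (weights are placeholders).
import numpy as np

def similarity_cost(traj, demo):
    time_err = np.sum((traj - demo) ** 2)
    freq_err = np.sum(np.abs(np.fft.rfft(traj, axis=0) - np.fft.rfft(demo, axis=0)) ** 2)
    return time_err + 1e-3 * freq_err

def stodi_step(traj, demo, n_samples=32, sigma=0.01, temp=10.0):
    noise = sigma * np.random.randn(n_samples, *traj.shape)   # explore
    costs = np.array([similarity_cost(traj + e, demo) for e in noise])
    w = np.exp(-temp * (costs - costs.min())); w /= w.sum()
    return traj + np.tensordot(w, noise, axes=1)              # exploit low-cost samples
</code></pre>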
</li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.21479">arXiv:2407.21479</a> <span> [<a href="https://arxiv.org/pdf/2407.21479">pdf</a>, <a href="https://arxiv.org/ps/2407.21479">ps</a>, <a href="https://arxiv.org/format/2407.21479">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1109/LWC.2024.3360053">10.1109/LWC.2024.3360053 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Air-to-Ground Cooperative OAM Communications </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Chen%2C+R">Ruirui Chen</a>, <a href="/search/eess?searchtype=author&query=Ding%2C+Y">Yu Ding</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+B">Beibei Zhang</a>, <a href="/search/eess?searchtype=author&query=Li%2C+S">Song Li</a>, <a href="/search/eess?searchtype=author&query=Liang%2C+L">Liping Liang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2407.21479v2-abstract-full"> For users in a hotspot region, orbital angular momentum (OAM) can realize a multifold increase in spectrum efficiency (SE), and a flying base station (FBS) can rapidly support real-time communication demand. However, the hollow divergence and alignment requirements pose crucial challenges for users attempting air-to-ground OAM communications, where a line-of-sight path exists. Therefore, we propose the air-to-ground cooperative OAM communication (ACOC) scheme, which can realize OAM communications for users with size-limited devices. The waist radius is adjusted to guarantee the maximum intensity at the cooperative users (CUs). We derive a closed-form expression for the optimal FBS position, which satisfies the antenna alignment for two cooperative user groups (CUGs). Furthermore, a selection constraint is given to choose two CUGs composed of four CUs. Simulation results are provided to validate the optimal FBS position and the SE superiority of the proposed ACOC scheme. </span> </p>
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.21479v2-abstract-full').style.display = 'none'; document.getElementById('2407.21479v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 31 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> IEEE WIRELESS COMMUNICATIONS LETTERS, VOL. 13, NO. 4, APRIL 2024 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.21478">arXiv:2407.21478</a> <span> [<a href="https://arxiv.org/pdf/2407.21478">pdf</a>, <a href="https://arxiv.org/ps/2407.21478">ps</a>, <a href="https://arxiv.org/format/2407.21478">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1109/TBC.2023.3275363">10.1109/TBC.2023.3275363 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Precoding Based Downlink OAM-MIMO Communications with Rate Splitting </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Chen%2C+R">Ruirui Chen</a>, <a href="/search/eess?searchtype=author&query=Lin%2C+J">Jinyang Lin</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+B">Beibei Zhang</a>, <a href="/search/eess?searchtype=author&query=Ding%2C+Y">Yu Ding</a>, <a href="/search/eess?searchtype=author&query=Xu%2C+K">Keyue Xu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.21478v2-abstract-short" style="display: inline;"> Orbital angular momentum (OAM) and rate splitting (RS) are the potential key techniques for the future wireless communications. As a new orthogonal resource, OAM can achieve the multifold increase of spectrum efficiency to relieve the scarcity of the spectrum resource, but how to enhance the privacy performance imposes crucial challenge for OAM communications. RS technique divides the information… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.21478v2-abstract-full').style.display = 'inline'; document.getElementById('2407.21478v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.21478v2-abstract-full" style="display: none;"> Orbital angular momentum (OAM) and rate splitting (RS) are the potential key techniques for the future wireless communications. As a new orthogonal resource, OAM can achieve the multifold increase of spectrum efficiency to relieve the scarcity of the spectrum resource, but how to enhance the privacy performance imposes crucial challenge for OAM communications. 
</li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.13545">arXiv:2407.13545</a> <span> [<a href="https://arxiv.org/pdf/2407.13545">pdf</a>, <a href="https://arxiv.org/format/2407.13545">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> DiffuX2CT: Diffusion Learning to Reconstruct CT Images from Biplanar X-Rays </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Liu%2C+X">Xuhui Liu</a>, <a href="/search/eess?searchtype=author&query=Qiao%2C+Z">Zhi Qiao</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+R">Runkun Liu</a>, <a href="/search/eess?searchtype=author&query=Li%2C+H">Hong Li</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+J">Juan Zhang</a>, <a href="/search/eess?searchtype=author&query=Zhen%2C+X">Xiantong Zhen</a>, <a href="/search/eess?searchtype=author&query=Qian%2C+Z">Zhen Qian</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+B">Baochang Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2407.13545v1-abstract-full"> Computed tomography (CT) is widely utilized in clinical settings because it delivers detailed 3D images of the human body. However, performing CT scans is not always feasible due to radiation exposure and limitations in certain surgical environments.
As an alternative, reconstructing CT images from ultra-sparse X-rays offers a valuable solution and has gained significant interest in scientific research and medical applications. However, it presents great challenges, as it is inherently an ill-posed problem, often compromised by artifacts resulting from overlapping structures in X-ray images. In this paper, we propose DiffuX2CT, which models CT reconstruction from orthogonal biplanar X-rays as a conditional diffusion process. DiffuX2CT is built on a 3D global-coherence denoising model with a new implicit conditioning mechanism, which we realize through a newly designed tri-plane decoupling generator and an implicit neural decoder. By doing so, DiffuX2CT achieves structure-controllable reconstruction, which enables 3D structural information to be recovered from 2D X-rays, therefore producing faithful textures in CT images. As an extra contribution, we collect a real-world lumbar CT dataset, called LumbarV, as a new benchmark to verify the clinical significance and performance of CT reconstruction from X-rays. Extensive experiments on this dataset and three more publicly available datasets demonstrate the effectiveness of our proposal. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p>
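<p class="is-size-7">At its core, training such a conditional diffusion model reduces to a standard noise-prediction step in which the X-ray-derived features act as the condition; the sketch below abstracts the tri-plane generator and implicit decoder into a generic <code>cond</code> tensor, so names and shapes are assumptions.</p>
<pre><code class="language-python">
# Noise-prediction training step for conditional diffusion over a 3D CT volume;
# the tri-plane generator and implicit decoder are abstracted into `cond`.
import torch
import torch.nn.functional as F

def diffusion_step(denoiser, ct_vol, cond, alphas_cumprod, optimizer):
    b = ct_vol.size(0)
    t = torch.randint(0, len(alphas_cumprod), (b,), device=ct_vol.device)
    a = alphas_cumprod[t].view(b, 1, 1, 1, 1)           # broadcast over (c, d, h, w)
    noise = torch.randn_like(ct_vol)
    noisy = a.sqrt() * ct_vol + (1 - a).sqrt() * noise  # forward process q(x_t | x_0)
    loss = F.mse_loss(denoiser(noisy, t, cond), noise)  # predict the added noise
    optimizer.zero_grad(); loss.backward(); optimizer.step()
    return loss.item()
</code></pre>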
</li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.05307">arXiv:2407.05307</a> <span> [<a href="https://arxiv.org/pdf/2407.05307">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> Edge-guided and Cross-scale Feature Fusion Network for Efficient Multi-contrast MRI Super-Resolution </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Yang%2C+Z">Zhiyuan Yang</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+B">Bo Zhang</a>, <a href="/search/eess?searchtype=author&query=Zeng%2C+Z">Zhiqiang Zeng</a>, <a href="/search/eess?searchtype=author&query=Yeo%2C+S+Y">Si Yong Yeo</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2407.05307v2-abstract-full"> In recent years, MRI super-resolution techniques have achieved great success, especially multi-contrast methods that extract texture information from reference images to guide the super-resolution reconstruction. However, current methods primarily focus on texture similarities at the same scale, neglecting cross-scale similarities that provide comprehensive information. Moreover, the misalignment between features of different scales impedes effective aggregation of information flow. To address these limitations, we propose a novel edge-guided and cross-scale feature fusion network, namely ECFNet. Specifically, we develop a pipeline consisting of deformable convolution and a cross-attention transformer to align features of different scales. The cross-scale fusion strategy fully integrates the texture information from different scales, significantly enhancing the super-resolution. In addition, a novel structure-information collaboration module is developed to guide the super-resolution reconstruction with implicit structure priors. The structure information enables the network to focus on high-frequency components of the image, resulting in sharper details. Extensive experiments on the IXI and BraTS2020 datasets demonstrate that our method achieves state-of-the-art performance compared to other multi-contrast MRI super-resolution methods, and that it is robust across different super-resolution scales. We intend to release our code and pretrained model after the paper is accepted. </span> </p>
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.05307v2-abstract-full').style.display = 'none'; document.getElementById('2407.05307v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 7 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">submitted to ICPR2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.04675">arXiv:2407.04675</a> <span> [<a href="https://arxiv.org/pdf/2407.04675">pdf</a>, <a href="https://arxiv.org/format/2407.04675">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> Seed-ASR: Understanding Diverse Speech and Contexts with LLM-based Speech Recognition </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Bai%2C+Y">Ye Bai</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+J">Jingping Chen</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+J">Jitong Chen</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+W">Wei Chen</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+Z">Zhuo Chen</a>, <a href="/search/eess?searchtype=author&query=Ding%2C+C">Chuang Ding</a>, <a href="/search/eess?searchtype=author&query=Dong%2C+L">Linhao Dong</a>, <a href="/search/eess?searchtype=author&query=Dong%2C+Q">Qianqian Dong</a>, <a href="/search/eess?searchtype=author&query=Du%2C+Y">Yujiao Du</a>, <a href="/search/eess?searchtype=author&query=Gao%2C+K">Kepan Gao</a>, <a href="/search/eess?searchtype=author&query=Gao%2C+L">Lu Gao</a>, <a href="/search/eess?searchtype=author&query=Guo%2C+Y">Yi Guo</a>, <a href="/search/eess?searchtype=author&query=Han%2C+M">Minglun Han</a>, <a href="/search/eess?searchtype=author&query=Han%2C+T">Ting Han</a>, <a href="/search/eess?searchtype=author&query=Hu%2C+W">Wenchao Hu</a>, <a href="/search/eess?searchtype=author&query=Hu%2C+X">Xinying Hu</a>, <a href="/search/eess?searchtype=author&query=Hu%2C+Y">Yuxiang Hu</a>, <a href="/search/eess?searchtype=author&query=Hua%2C+D">Deyu Hua</a>, <a href="/search/eess?searchtype=author&query=Huang%2C+L">Lu Huang</a>, <a href="/search/eess?searchtype=author&query=Huang%2C+M">Mingkun Huang</a>, <a href="/search/eess?searchtype=author&query=Huang%2C+Y">Youjia Huang</a>, <a href="/search/eess?searchtype=author&query=Jin%2C+J">Jishuo Jin</a>, <a href="/search/eess?searchtype=author&query=Kong%2C+F">Fanliu Kong</a>, <a href="/search/eess?searchtype=author&query=Lan%2C+Z">Zongwei Lan</a>, <a href="/search/eess?searchtype=author&query=Li%2C+T">Tianyu Li</a> , et al. 
</li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.04675">arXiv:2407.04675</a> <span> [<a href="https://arxiv.org/pdf/2407.04675">pdf</a>, <a href="https://arxiv.org/format/2407.04675">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> Seed-ASR: Understanding Diverse Speech and Contexts with LLM-based Speech Recognition </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Bai%2C+Y">Ye Bai</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+J">Jingping Chen</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+J">Jitong Chen</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+W">Wei Chen</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+Z">Zhuo Chen</a>, <a href="/search/eess?searchtype=author&query=Ding%2C+C">Chuang Ding</a>, <a href="/search/eess?searchtype=author&query=Dong%2C+L">Linhao Dong</a>, <a href="/search/eess?searchtype=author&query=Dong%2C+Q">Qianqian Dong</a>, <a href="/search/eess?searchtype=author&query=Du%2C+Y">Yujiao Du</a>, <a href="/search/eess?searchtype=author&query=Gao%2C+K">Kepan Gao</a>, <a href="/search/eess?searchtype=author&query=Gao%2C+L">Lu Gao</a>, <a href="/search/eess?searchtype=author&query=Guo%2C+Y">Yi Guo</a>, <a href="/search/eess?searchtype=author&query=Han%2C+M">Minglun Han</a>, <a href="/search/eess?searchtype=author&query=Han%2C+T">Ting Han</a>, <a href="/search/eess?searchtype=author&query=Hu%2C+W">Wenchao Hu</a>, <a href="/search/eess?searchtype=author&query=Hu%2C+X">Xinying Hu</a>, <a href="/search/eess?searchtype=author&query=Hu%2C+Y">Yuxiang Hu</a>, <a href="/search/eess?searchtype=author&query=Hua%2C+D">Deyu Hua</a>, <a href="/search/eess?searchtype=author&query=Huang%2C+L">Lu Huang</a>, <a href="/search/eess?searchtype=author&query=Huang%2C+M">Mingkun Huang</a>, <a href="/search/eess?searchtype=author&query=Huang%2C+Y">Youjia Huang</a>, <a href="/search/eess?searchtype=author&query=Jin%2C+J">Jishuo Jin</a>, <a href="/search/eess?searchtype=author&query=Kong%2C+F">Fanliu Kong</a>, <a href="/search/eess?searchtype=author&query=Lan%2C+Z">Zongwei Lan</a>, <a href="/search/eess?searchtype=author&query=Li%2C+T">Tianyu Li</a> , et al. (30 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2407.04675v2-abstract-full"> Modern automatic speech recognition (ASR) models are required to accurately transcribe diverse speech signals (from different domains, languages, accents, etc.) given specific contextual information in various application scenarios. Classic end-to-end models fused with extra language models perform well, but mainly in data-matched scenarios, and they are gradually approaching a bottleneck. In this work, we introduce Seed-ASR, a large language model (LLM) based speech recognition model. Seed-ASR is developed on the framework of audio-conditioned LLM (AcLLM), leveraging the capabilities of LLMs by inputting continuous speech representations together with contextual information into the LLM. Through stage-wise large-scale training and the elicitation of context-aware capabilities in the LLM, Seed-ASR demonstrates significant improvement over end-to-end models on comprehensive evaluation sets covering multiple domains, accents/dialects, and languages. Additionally, Seed-ASR can be further deployed to support specific needs in various scenarios without requiring extra language models. Compared to recently released large ASR models, Seed-ASR achieves a 10%-40% reduction in word (or character, for Chinese) error rates on Chinese and English public test sets, further demonstrating its powerful performance. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 5 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p>
</li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.04051">arXiv:2407.04051</a> <span> [<a href="https://arxiv.org/pdf/2407.04051">pdf</a>, <a href="https://arxiv.org/format/2407.04051">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> FunAudioLLM: Voice Understanding and Generation Foundation Models for Natural Interaction Between Humans and LLMs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=An%2C+K">Keyu An</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+Q">Qian Chen</a>, <a href="/search/eess?searchtype=author&query=Deng%2C+C">Chong Deng</a>, <a href="/search/eess?searchtype=author&query=Du%2C+Z">Zhihao Du</a>, <a href="/search/eess?searchtype=author&query=Gao%2C+C">Changfeng Gao</a>, <a href="/search/eess?searchtype=author&query=Gao%2C+Z">Zhifu Gao</a>, <a href="/search/eess?searchtype=author&query=Gu%2C+Y">Yue Gu</a>, <a href="/search/eess?searchtype=author&query=He%2C+T">Ting He</a>, <a href="/search/eess?searchtype=author&query=Hu%2C+H">Hangrui Hu</a>, <a href="/search/eess?searchtype=author&query=Hu%2C+K">Kai Hu</a>, <a href="/search/eess?searchtype=author&query=Ji%2C+S">Shengpeng Ji</a>, <a href="/search/eess?searchtype=author&query=Li%2C+Y">Yabin Li</a>, <a href="/search/eess?searchtype=author&query=Li%2C+Z">Zerui Li</a>, <a href="/search/eess?searchtype=author&query=Lu%2C+H">Heng Lu</a>, <a href="/search/eess?searchtype=author&query=Luo%2C+H">Haoneng Luo</a>, <a href="/search/eess?searchtype=author&query=Lv%2C+X">Xiang Lv</a>, <a href="/search/eess?searchtype=author&query=Ma%2C+B">Bin Ma</a>, <a href="/search/eess?searchtype=author&query=Ma%2C+Z">Ziyang Ma</a>, <a href="/search/eess?searchtype=author&query=Ni%2C+C">Chongjia Ni</a>, <a href="/search/eess?searchtype=author&query=Song%2C+C">Changhe Song</a>, <a href="/search/eess?searchtype=author&query=Shi%2C+J">Jiaqi Shi</a>, <a href="/search/eess?searchtype=author&query=Shi%2C+X">Xian Shi</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+H">Hao Wang</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+W">Wen Wang</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+Y">Yuxuan Wang</a> , et al. (8 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2407.04051v3-abstract-full"> This report introduces FunAudioLLM, a model family designed to enhance natural voice interactions between humans and large language models (LLMs).
At its core are two innovative models: SenseVoice, which handles multilingual speech recognition, emotion recognition, and audio event detection; and CosyVoice, which facilitates natural speech generation with control over multiple languages, timbre, speaking style, and speaker identity. SenseVoice-Small delivers exceptionally low-latency ASR for 5 languages, and SenseVoice-Large supports high-precision ASR for over 50 languages, while CosyVoice excels in multilingual voice generation, zero-shot in-context learning, cross-lingual voice cloning, and instruction-following capabilities. The models related to SenseVoice and CosyVoice have been open-sourced on Modelscope and Huggingface, along with the corresponding training, inference, and fine-tuning code released on GitHub. By integrating these models with LLMs, FunAudioLLM enables applications such as speech-to-speech translation, emotional voice chat, interactive podcasts, and expressive audiobook narration, thereby pushing the boundaries of voice interaction technology. Demos are available at https://fun-audio-llm.github.io, and the code can be accessed at https://github.com/FunAudioLLM. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 4 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Work in progress. Authors are listed in alphabetical order by family name</span> </p>
</li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.02455">arXiv:2407.02455</a> <span> [<a href="https://arxiv.org/pdf/2407.02455">pdf</a>, <a href="https://arxiv.org/format/2407.02455">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1109/IoTDI61053.2024.00020">10.1109/IoTDI61053.2024.00020 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> SUPER: Seated Upper Body Pose Estimation using mmWave Radars </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Zhang%2C+B">Bo Zhang</a>, <a href="/search/eess?searchtype=author&query=Zhou%2C+Z">Zimeng Zhou</a>, <a href="/search/eess?searchtype=author&query=Jiang%2C+B">Boyu Jiang</a>, <a href="/search/eess?searchtype=author&query=Zheng%2C+R">Rong Zheng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2407.02455v1-abstract-full"> In industrial countries, adults spend a considerable amount of time sedentary each day at work, while driving, and during activities of daily living. Characterizing seated upper-body human poses using mmWave radars is an important yet under-studied topic, with many applications in human-machine interaction, transportation, and road safety. In this work, we devise SUPER, a framework for seated upper-body human pose estimation that utilizes dual mmWave radars in close proximity. A novel masking algorithm is proposed to coherently fuse data from the radars to generate intensity and Doppler point clouds with complementary information for high-motion but small radar cross section (RCS) areas (e.g., upper extremities) and low-motion but large RCS areas (e.g., torso). A lightweight neural network extracts both global and local features of the upper body and outputs pose parameters for the Skinned Multi-Person Linear (SMPL) model. Extensive leave-one-subject-out experiments on various motion sequences from multiple subjects show that SUPER outperforms a state-of-the-art baseline method by 30-184%.
We also demonstrate its utility in a simple downstream task for hand-object interaction. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.02251">arXiv:2407.02251</a> <span> [<a href="https://arxiv.org/pdf/2407.02251">pdf</a>, <a href="https://arxiv.org/format/2407.02251">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> White-Box 3D-OMP-Transformer for ISAC </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Zhang%2C+B">Bowen Zhang</a>, <a href="/search/eess?searchtype=author&query=Li%2C+G+Y">Geoffrey Ye Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2407.02251v1-abstract-full"> Transformers have found broad application thanks to their ability to capture long-range dependencies among inputs using attention mechanisms. The recent success of transformers has increased the need for mathematical interpretation of their underlying working mechanisms, leading to the development of a family of white-box transformer-like deep network architectures. However, designing white-box transformers with efficient three-dimensional (3D) attention remains an open challenge. In this work, we revisit the 3D orthogonal matching pursuit (OMP) algorithm and demonstrate that the operation of 3D-OMP is analogous to a specific kind of transformer with 3D attention. We therefore build a white-box 3D-OMP-Transformer by introducing additional learnable parameters into 3D-OMP. As a transformer, its 3D attention can be mathematically interpreted from 3D-OMP; as a variant of OMP, it can learn to improve the matching pursuit process from data. Moreover, a transformer's performance can be improved by stacking more transformer blocks. To simulate this process, we design a cascaded 3D-OMP-Transformer with dynamic small-scale dictionaries, which improves the performance of the 3D-OMP-Transformer at low cost.
We evaluate the designed 3D-OMP-Transformer on the multi-target detection task of integrated sensing and communications (ISAC). Experimental results show that the designed 3D-OMP-Transformer outperforms current baselines. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p>
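<p class="is-size-7">As background for the unrolling argument, here is plain 1-D orthogonal matching pursuit; the paper's point is that the matching step behaves like an attention score, so replacing the fixed dictionary with learnable parameters and stacking iterations yields a transformer-like, mathematically interpretable network. The 3-D variant factorizes the dictionary across three dimensions but follows the same loop.</p>
<pre><code class="language-python">
# Plain 1-D orthogonal matching pursuit for reference; the paper unrolls a 3-D
# variant of this loop into transformer-like blocks with learnable weights.
import numpy as np

def omp(D, y, k):
    """Recover a k-sparse code of y over dictionary D (unit-norm columns)."""
    residual, support = y.copy(), []
    for _ in range(k):
        support.append(int(np.argmax(np.abs(D.T @ residual))))  # attention-like matching
        Ds = D[:, support]
        coef, *_ = np.linalg.lstsq(Ds, y, rcond=None)           # least-squares refit
        residual = y - Ds @ coef
    return support, coef
</code></pre>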
</li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.18547">arXiv:2406.18547</a> <span> [<a href="https://arxiv.org/pdf/2406.18547">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Enhancing Medical Imaging with GANs Synthesizing Realistic Images from Limited Data </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Feng%2C+Y">Yinqiu Feng</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+B">Bo Zhang</a>, <a href="/search/eess?searchtype=author&query=Xiao%2C+L">Lingxi Xiao</a>, <a href="/search/eess?searchtype=author&query=Yang%2C+Y">Yutian Yang</a>, <a href="/search/eess?searchtype=author&query=Gegen%2C+T">Tana Gegen</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+Z">Zexi Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2406.18547v1-abstract-full"> In this research, we introduce an innovative method for synthesizing medical images using generative adversarial networks (GANs). Our proposed GAN-based method can produce realistic synthetic images even when trained on a limited quantity of real medical image data, showing strong generalization. To achieve this, we devised generator and discriminator network architectures founded on deep convolutional neural networks (CNNs), leveraging the adversarial training paradigm for model optimization. Through extensive experimentation across diverse medical image datasets, our method exhibits robust performance, consistently generating synthetic images that closely emulate the structural and textural attributes of authentic medical images. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p>
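<p class="is-size-7">The adversarial training paradigm referred to here is the standard alternating GAN update, sketched below with hypothetical generator/discriminator objects; medical-image specifics and network definitions are omitted.</p>
<pre><code class="language-python">
# Standard alternating GAN update; G and D are hypothetical CNN generator and
# discriminator modules, and data loading is omitted.
import torch
import torch.nn.functional as F

def gan_step(G, D, opt_g, opt_d, real, z_dim=100):
    b = real.size(0)
    ones = torch.ones(b, 1, device=real.device)
    zeros = torch.zeros(b, 1, device=real.device)
    fake = G(torch.randn(b, z_dim, device=real.device))
    # discriminator: push real toward 1, synthetic toward 0
    d_loss = F.binary_cross_entropy_with_logits(D(real), ones) \
           + F.binary_cross_entropy_with_logits(D(fake.detach()), zeros)
    opt_d.zero_grad(); d_loss.backward(); opt_d.step()
    # generator: fool the discriminator
    g_loss = F.binary_cross_entropy_with_logits(D(fake), ones)
    opt_g.zero_grad(); g_loss.backward(); opt_g.step()
    return d_loss.item(), g_loss.item()
</code></pre>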
</li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.18327">arXiv:2406.18327</a> <span> [<a href="https://arxiv.org/pdf/2406.18327">pdf</a>, <a href="https://arxiv.org/format/2406.18327">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Multi-modal Evidential Fusion Network for Trusted PET/CT Tumor Segmentation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Qi%2C+Y">Yuxuan Qi</a>, <a href="/search/eess?searchtype=author&query=Lin%2C+L">Li Lin</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+J">Jiajun Wang</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+J">Jingya Zhang</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+B">Bin Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2406.18327v1-abstract-full"> Accurate segmentation of tumors in PET/CT images is important in computer-aided diagnosis and treatment of cancer. The key issue of such a segmentation problem lies in the effective integration of complementary information from PET and CT images. However, the quality of PET and CT images varies widely in clinical settings, which leads to uncertainty in the modality information extracted by networks. To take this uncertainty into account in multi-modal information fusion, this paper proposes a novel Multi-modal Evidential Fusion Network (MEFN) comprising a Cross-Modal Feature Learning (CFL) module and a Multi-modal Trusted Fusion (MTF) module. The CFL module reduces the domain gap upon modality conversion and highlights common tumor features, thereby alleviating the need for the segmentation module to handle modality specificity. The MTF module utilizes mutual attention mechanisms and an uncertainty calibrator to fuse modality features based on modality uncertainty, and then fuses the segmentation results under the guidance of Dempster-Shafer theory. Besides, a new uncertainty-perceptual loss is introduced to force the model to focus on uncertain features, improving its ability to extract trusted modality information. Extensive comparative experiments are conducted on two publicly available PET/CT datasets, and the results demonstrate that MEFN significantly outperforms state-of-the-art methods, with improvements of 2.15% and 3.23% in DSC scores on the AutoPET and Hecktor datasets, respectively. More importantly, our model can provide radiologists with credible uncertainty estimates for the segmentation results to inform their decision to accept or reject the automatic segmentation results, which is particularly important for clinical applications. Our code will be available at https://github.com/QPaws/MEFN. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p>
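<p class="is-size-7">The Dempster-Shafer fusion step can be illustrated with Dempster's combination rule over per-pixel class masses plus an "unknown" mass; this simplified form is an assumption about, not a transcription of, the MEFN implementation.</p>
<pre><code class="language-python">
# Dempster's combination rule for two evidence sources over C classes plus an
# "unknown" mass -- a simplified stand-in for MEFN's trusted fusion step.
import numpy as np

def dempster_combine(m1, u1, m2, u2):
    """m*: (..., C) class masses, u*: (...,) uncertainty; each source sums to 1."""
    total = m1.sum(-1) * m2.sum(-1)
    conflict = total - (m1 * m2).sum(-1)      # mass assigned to disagreeing classes
    k = 1.0 - conflict                        # normalization constant
    m = (m1 * m2 + m1 * u2[..., None] + m2 * u1[..., None]) / k[..., None]
    u = (u1 * u2) / k
    return m, u
</code></pre>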
href="/search/eess?searchtype=author&query=Li%2C+M">Ming Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.07256v1-abstract-short" style="display: inline;"> The rapid advancements in speech technologies over the past two decades have led to human-level performance in tasks like automatic speech recognition (ASR) for fluent speech. However, the efficacy of these models diminishes when applied to atypical speech, such as stuttering. This paper introduces AS-70, the first publicly available Mandarin stuttered speech dataset, which stands out as the large… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.07256v1-abstract-full').style.display = 'inline'; document.getElementById('2406.07256v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.07256v1-abstract-full" style="display: none;"> The rapid advancements in speech technologies over the past two decades have led to human-level performance in tasks like automatic speech recognition (ASR) for fluent speech. However, the efficacy of these models diminishes when applied to atypical speech, such as stuttering. This paper introduces AS-70, the first publicly available Mandarin stuttered speech dataset, which stands out as the largest dataset in its category. Encompassing conversational and voice command reading speech, AS-70 includes verbatim manual transcription, rendering it suitable for various speech-related tasks. Furthermore, baseline systems are established, and experimental results are presented for ASR and stuttering event detection (SED) tasks. By incorporating this dataset into the model fine-tuning, significant improvements in the state-of-the-art ASR models, e.g., Whisper and Hubert, are observed, enhancing their inclusivity in addressing stuttered speech. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.07256v1-abstract-full').style.display = 'none'; document.getElementById('2406.07256v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by Interspeech 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.05763">arXiv:2406.05763</a> <span> [<a href="https://arxiv.org/pdf/2406.05763">pdf</a>, <a href="https://arxiv.org/format/2406.05763">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> WenetSpeech4TTS: A 12,800-hour Mandarin TTS Corpus for Large Speech Generation Model Benchmark </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Ma%2C+L">Linhan Ma</a>, <a href="/search/eess?searchtype=author&query=Guo%2C+D">Dake Guo</a>, <a href="/search/eess?searchtype=author&query=Song%2C+K">Kun Song</a>, <a href="/search/eess?searchtype=author&query=Jiang%2C+Y">Yuepeng Jiang</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+S">Shuai Wang</a>, <a href="/search/eess?searchtype=author&query=Xue%2C+L">Liumeng Xue</a>, <a href="/search/eess?searchtype=author&query=Xu%2C+W">Weiming Xu</a>, <a href="/search/eess?searchtype=author&query=Zhao%2C+H">Huan Zhao</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+B">Binbin Zhang</a>, <a href="/search/eess?searchtype=author&query=Xie%2C+L">Lei Xie</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.05763v3-abstract-short" style="display: inline;"> With the development of large text-to-speech (TTS) models and scale-up of the training data, state-of-the-art TTS systems have achieved impressive performance. In this paper, we present WenetSpeech4TTS, a multi-domain Mandarin corpus derived from the open-sourced WenetSpeech dataset. Tailored for the text-to-speech tasks, we refined WenetSpeech by adjusting segment boundaries, enhancing the audio… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.05763v3-abstract-full').style.display = 'inline'; document.getElementById('2406.05763v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.05763v3-abstract-full" style="display: none;"> With the development of large text-to-speech (TTS) models and scale-up of the training data, state-of-the-art TTS systems have achieved impressive performance. In this paper, we present WenetSpeech4TTS, a multi-domain Mandarin corpus derived from the open-sourced WenetSpeech dataset. Tailored for the text-to-speech tasks, we refined WenetSpeech by adjusting segment boundaries, enhancing the audio quality, and eliminating speaker mixing within each segment. Following a more accurate transcription process and quality-based data filtering process, the obtained WenetSpeech4TTS corpus contains $12,800$ hours of paired audio-text data. Furthermore, we have created subsets of varying sizes, categorized by segment quality scores to allow for TTS model training and fine-tuning. VALL-E and NaturalSpeech 2 systems are trained and fine-tuned on these subsets to validate the usability of WenetSpeech4TTS, establishing baselines on benchmark for fair comparison of TTS systems. 
arXiv:2406.05763 [eess.AS]
WenetSpeech4TTS: A 12,800-hour Mandarin TTS Corpus for Large Speech Generation Model Benchmark
Authors: Linhan Ma, Dake Guo, Kun Song, Yuepeng Jiang, Shuai Wang, Liumeng Xue, Weiming Xu, Huan Zhao, Binbin Zhang, Lei Xie
Abstract: With the development of large text-to-speech (TTS) models and the scale-up of training data, state-of-the-art TTS systems have achieved impressive performance. In this paper, we present WenetSpeech4TTS, a multi-domain Mandarin corpus derived from the open-sourced WenetSpeech dataset. Tailored for text-to-speech tasks, we refined WenetSpeech by adjusting segment boundaries, enhancing the audio quality, and eliminating speaker mixing within each segment. Following a more accurate transcription process and quality-based data filtering, the obtained WenetSpeech4TTS corpus contains $12,800$ hours of paired audio-text data. Furthermore, we have created subsets of varying sizes, categorized by segment quality scores, to allow for TTS model training and fine-tuning. VALL-E and NaturalSpeech 2 systems are trained and fine-tuned on these subsets to validate the usability of WenetSpeech4TTS, establishing baselines on this benchmark for fair comparison of TTS systems. The corpus and corresponding benchmarks are publicly available on huggingface.
Submitted 19 June, 2024; v1 submitted 9 June, 2024; originally announced June 2024.
Comments: Accepted by INTERSPEECH 2024
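A minimal sketch of the kind of quality-score bucketing the subset construction above implies; the manifest fields, thresholds, and subset names here are hypothetical, not the corpus's actual schema.

```python
# Hypothetical manifest partitioning by per-segment quality score.
# Field names and thresholds are illustrative, not the released schema.
segments = [
    {"audio": "seg_0001.wav", "text": "...", "quality": 4.2},
    {"audio": "seg_0002.wav", "text": "...", "quality": 3.1},
    {"audio": "seg_0003.wav", "text": "...", "quality": 2.4},
]

def bucket(score: float) -> str:
    if score >= 4.0:
        return "premium"   # highest-quality slice, e.g. for fine-tuning
    if score >= 3.0:
        return "standard"  # mid-quality slice for bulk training
    return "basic"         # everything else

subsets = {}
for seg in segments:
    subsets.setdefault(bucket(seg["quality"]), []).append(seg)

for name, segs in sorted(subsets.items()):
    print(name, len(segs))
```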
arXiv:2406.00974 [eess.SY]
Large Language Model Assisted Optimal Bidding of BESS in FCAS Market: An AI-agent based Approach
Authors: Borui Zhang, Chaojie Li, Guo Chen, Zhaoyang Dong
Abstract: To incentivize flexible resources such as Battery Energy Storage Systems (BESSs) to offer Frequency Control Ancillary Services (FCAS), Australia's National Electricity Market (NEM) has implemented changes in recent years towards shorter-term bidding rules and faster service requirements. However, first, existing bidding optimization methods often overlook or oversimplify key aspects of FCAS market procedures, resulting in an inaccurate depiction of the market bidding process. Thus, the BESS bidding problem is modeled based on actual bidding records and the latest market specifications, and then formulated as a deep reinforcement learning (DRL) problem. Second, the erratic decisions of the DRL agent caused by imperfectly predicted market information increase the risk of profit loss. Hence, a Conditional Value at Risk (CVaR)-based DRL algorithm is developed to enhance the risk resilience of bidding strategies. Third, well-trained DRL models still face performance decline in uncommon scenarios during online operations. Therefore, a Large Language Model (LLM)-assisted artificial intelligence (AI)-agent interactive decision-making framework is proposed to improve strategy timeliness, reliability, and interpretability in uncertain new scenarios, with conditional hybrid decision and self-reflection mechanisms designed to address the hallucination challenge of LLMs. Experimental results demonstrate that our proposed framework achieves higher bidding profitability than the baseline methods by effectively mitigating the profit loss caused by various uncertainties.
Submitted 3 June, 2024; originally announced June 2024.
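For reference, Conditional Value at Risk at level α is the expected loss in the worst (1 − α) tail of the loss distribution. The sketch below computes empirical VaR/CVaR for simulated bidding profits; it illustrates the risk measure only, not the paper's DRL algorithm.

```python
import numpy as np

def cvar(losses: np.ndarray, alpha: float = 0.95) -> float:
    """Empirical Conditional Value at Risk: mean loss in the worst
    (1 - alpha) tail. `losses` are per-scenario losses (negative profit)."""
    var = np.quantile(losses, alpha)   # Value at Risk threshold
    tail = losses[losses >= var]       # scenarios at or beyond the threshold
    return float(tail.mean())

rng = np.random.default_rng(0)
profits = rng.normal(loc=100.0, scale=40.0, size=10_000)  # simulated outcomes
losses = -profits
print("VaR(95%): ", np.quantile(losses, 0.95))
print("CVaR(95%):", cvar(losses, 0.95))
```

A CVaR-based agent typically optimizes a weighted combination of expected return and this tail statistic, trading a little average profit for robustness to the worst scenarios.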
arXiv:2405.18782 [eess.IV, cs.CV, stat.ML]
Principled Probabilistic Imaging using Diffusion Models as Plug-and-Play Priors
Authors: Zihui Wu, Yu Sun, Yifan Chen, Bingliang Zhang, Yisong Yue, Katherine L. Bouman
Abstract: Diffusion models (DMs) have recently shown outstanding capabilities in modeling complex image distributions, making them expressive image priors for solving Bayesian inverse problems. However, most existing DM-based methods rely on approximations in the generative process to be generic to different inverse problems, leading to inaccurate sample distributions that deviate from the target posterior defined within the Bayesian framework. To harness the generative power of DMs while avoiding such approximations, we propose a Markov chain Monte Carlo algorithm that performs posterior sampling for general inverse problems by reducing it to sampling the posterior of a Gaussian denoising problem. Crucially, we leverage a general DM formulation as a unified interface that allows for rigorously solving the denoising problem with a range of state-of-the-art DMs. We demonstrate the effectiveness of the proposed method on six inverse problems (three linear and three nonlinear), including a real-world black hole imaging problem. Experimental results indicate that our proposed method offers more accurate reconstructions and posterior estimation compared to existing DM-based imaging inverse methods.
Submitted 6 November, 2024; v1 submitted 29 May, 2024; originally announced May 2024.
Comments: Accepted to NeurIPS 2024
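One standard way to realize the reduction described above (a generic split-Gibbs construction, stated here as background rather than the paper's exact scheme) couples the image $x$ to an auxiliary variable $z$ through a Gaussian:

$$ p(x, z \mid y) \;\propto\; p(y \mid x)\, p(z)\, \exp\!\Big(-\tfrac{1}{2\rho^2}\|x - z\|^2\Big), $$

and a Gibbs sampler alternates the two conditional draws

$$ x \sim p(x \mid z, y) \propto p(y \mid x)\, e^{-\|x - z\|^2/2\rho^2}, \qquad z \sim p(z \mid x) \propto p(z)\, e^{-\|x - z\|^2/2\rho^2}, $$

where the $z$-update is exactly the posterior of a Gaussian denoising problem at noise level $\rho$, the kind of problem a diffusion model can sample rigorously.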
Dykman</a>, <a href="/search/eess?searchtype=author&query=Chan%2C+H+B">H. B. Chan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.10977v1-abstract-short" style="display: inline;"> We present a method to stabilize the frequency of self-sustained vibrations in micro- and nanomechanical resonators. The method refers to a two-mode system with the vibrations at significantly different frequencies. The signal from one mode is used to control the other mode. In the experiment, self-sustained oscillations of micromechanical modes are excited by pumping at the blue-detuned sideband… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.10977v1-abstract-full').style.display = 'inline'; document.getElementById('2405.10977v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.10977v1-abstract-full" style="display: none;"> We present a method to stabilize the frequency of self-sustained vibrations in micro- and nanomechanical resonators. The method refers to a two-mode system with the vibrations at significantly different frequencies. The signal from one mode is used to control the other mode. In the experiment, self-sustained oscillations of micromechanical modes are excited by pumping at the blue-detuned sideband of the higher-frequency mode. Phase fluctuations of the two modes show near perfect anti-correlation. They can be compensated in either one of the modes by a stepwise change of the pump phase. The phase change of the controlled mode is proportional to the pump phase change, with the proportionality constant independent of the pump amplitude and frequency. This finding allows us to stabilize the phase of one mode against phase diffusion using the measured phase of the other mode. We demonstrate that phase fluctuations of either the high or low frequency mode can be significantly reduced. The results open new opportunities in generating stable vibrations in a broad frequency range via parametric downconversion in nonlinear resonators. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.10977v1-abstract-full').style.display = 'none'; document.getElementById('2405.10977v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.02942">arXiv:2405.02942</a> <span> [<a href="https://arxiv.org/pdf/2405.02942">pdf</a>, <a href="https://arxiv.org/format/2405.02942">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Optics">physics.optics</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> Design, analysis, and manufacturing of a glass-plastic hybrid minimalist aspheric panoramic annular lens </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Gao%2C+S">Shaohua Gao</a>, <a href="/search/eess?searchtype=author&query=Jiang%2C+Q">Qi Jiang</a>, <a href="/search/eess?searchtype=author&query=Liao%2C+Y">Yiqi Liao</a>, <a href="/search/eess?searchtype=author&query=Qiu%2C+Y">Yi Qiu</a>, <a href="/search/eess?searchtype=author&query=Ying%2C+W">Wanglei Ying</a>, <a href="/search/eess?searchtype=author&query=Yang%2C+K">Kailun Yang</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+K">Kaiwei Wang</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+B">Benhao Zhang</a>, <a href="/search/eess?searchtype=author&query=Bai%2C+J">Jian Bai</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.02942v1-abstract-short" style="display: inline;"> We propose a high-performance glass-plastic hybrid minimalist aspheric panoramic annular lens (ASPAL) to solve several major limitations of the traditional panoramic annular lens (PAL), such as large size, high weight, and complex system. The field of view (FoV) of the ASPAL is 360掳x(35掳~110掳) and the imaging quality is close to the diffraction limit. This large FoV ASPAL is composed of only 4 len… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.02942v1-abstract-full').style.display = 'inline'; document.getElementById('2405.02942v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.02942v1-abstract-full" style="display: none;"> We propose a high-performance glass-plastic hybrid minimalist aspheric panoramic annular lens (ASPAL) to solve several major limitations of the traditional panoramic annular lens (PAL), such as large size, high weight, and complex system. The field of view (FoV) of the ASPAL is 360掳x(35掳~110掳) and the imaging quality is close to the diffraction limit. This large FoV ASPAL is composed of only 4 lenses. Moreover, we establish a physical structure model of PAL using the ray tracing method and study the influence of its physical parameters on compactness ratio. In addition, for the evaluation of local tolerances of annular surfaces, we propose a tolerance analysis method suitable for ASPAL. This analytical method can effectively analyze surface irregularities on annular surfaces and provide clear guidance on manufacturing tolerances for ASPAL. 
Benefiting from high-precision glass molding and injection molding aspheric lens manufacturing techniques, we finally manufactured 20 ASPALs in small batches. The weight of an ASPAL prototype is only 8.5 g. Our framework provides promising insights for the application of panoramic systems in space and weight-constrained environmental sensing scenarios such as intelligent security, micro-UAVs, and micro-robots. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.02942v1-abstract-full').style.display = 'none'; document.getElementById('2405.02942v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to Optics & Laser Technology</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.16407">arXiv:2404.16407</a> <span> [<a href="https://arxiv.org/pdf/2404.16407">pdf</a>, <a href="https://arxiv.org/format/2404.16407">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> U2++ MoE: Scaling 4.7x parameters with minimal impact on RTF </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Song%2C+X">Xingchen Song</a>, <a href="/search/eess?searchtype=author&query=Wu%2C+D">Di Wu</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+B">Binbin Zhang</a>, <a href="/search/eess?searchtype=author&query=Zhou%2C+D">Dinghao Zhou</a>, <a href="/search/eess?searchtype=author&query=Peng%2C+Z">Zhendong Peng</a>, <a href="/search/eess?searchtype=author&query=Dang%2C+B">Bo Dang</a>, <a href="/search/eess?searchtype=author&query=Pan%2C+F">Fuping Pan</a>, <a href="/search/eess?searchtype=author&query=Yang%2C+C">Chao Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.16407v2-abstract-short" style="display: inline;"> Scale has opened new frontiers in natural language processing, but at a high cost. In response, by learning to only activate a subset of parameters in training and inference, Mixture-of-Experts (MoE) have been proposed as an energy efficient path to even larger and more capable language models and this shift towards a new generation of foundation models is gaining momentum, particularly within the… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.16407v2-abstract-full').style.display = 'inline'; document.getElementById('2404.16407v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.16407v2-abstract-full" style="display: none;"> Scale has opened new frontiers in natural language processing, but at a high cost. 
arXiv:2404.16407 [cs.CL, eess.AS]
U2++ MoE: Scaling 4.7x parameters with minimal impact on RTF
Authors: Xingchen Song, Di Wu, Binbin Zhang, Dinghao Zhou, Zhendong Peng, Bo Dang, Fuping Pan, Chao Yang
Abstract: Scale has opened new frontiers in natural language processing, but at a high cost. In response, by learning to activate only a subset of parameters in training and inference, Mixture-of-Experts (MoE) models have been proposed as an energy-efficient path to even larger and more capable language models, and this shift towards a new generation of foundation models is gaining momentum, particularly within the field of Automatic Speech Recognition (ASR). Recent works that incorporate MoE into ASR models have complex designs, such as routing frames via a supplementary embedding network, improving the multilingual ability of the experts, and utilizing dedicated auxiliary losses for either expert load balancing or specific language handling. We find that such delicate designs are not necessary: an embarrassingly simple substitution of MoE layers for all Feed-Forward Network (FFN) layers is competent for the ASR task. More specifically, we benchmark our proposed model on a large-scale inner-source dataset (160k hours); the results show that we can scale our baseline Conformer (Dense-225M) to its MoE counterpart (MoE-1B) and achieve Dense-1B-level Word Error Rate (WER) while maintaining a Dense-225M-level Real Time Factor (RTF). Furthermore, by applying the Unified 2-pass framework with bidirectional attention decoders (U2++), we achieve streaming and non-streaming decoding modes in a single MoE-based model, which we call U2++ MoE. We hope that our study can facilitate research on scaling speech foundation models without sacrificing deployment efficiency.
Submitted 8 August, 2024; v1 submitted 25 April, 2024; originally announced April 2024.
ACM Class: I.2.7
arXiv:2404.08805 [eess.IV, cs.CV, cs.LG] DOI: 10.1117/12.2611097
Real-time guidewire tracking and segmentation in intraoperative x-ray
Authors: Baochang Zhang, Mai Bui, Cheng Wang, Felix Bourier, Heribert Schunkert, Nassir Navab
Abstract: During endovascular interventions, physicians have to perform accurate and immediate operations based on the available real-time information, such as the shape and position of guidewires observed on the fluoroscopic images, haptic information, and the patients' physiological signals. For this purpose, real-time and accurate guidewire segmentation and tracking can enhance the visualization of guidewires and provide visual feedback for physicians during the intervention, as well as for robot-assisted interventions. Nevertheless, this task often comes with the challenge of elongated deformable structures that present themselves with low contrast in noisy fluoroscopic image sequences. To address these issues, a two-stage deep learning framework for real-time guidewire segmentation and tracking is proposed. In the first stage, a YOLOv5s detector is trained, using the original X-ray images as well as synthetic ones, to output bounding boxes of possible target guidewires. More importantly, a refinement module based on spatiotemporal constraints is incorporated to robustly localize the guidewire and remove false detections. In the second stage, a novel and efficient network is proposed to segment the guidewire in each detected bounding box. The network contains two major modules, namely a Hessian-based enhancement embedding module and a dual self-attention module. Quantitative and qualitative evaluations on clinical intra-operative images demonstrate that the proposed approach significantly outperforms our baselines as well as the current state of the art and, in comparison, shows higher robustness to low-quality images.
Submitted 12 April, 2024; originally announced April 2024.
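A toy illustration of a temporal-consistency filter of the kind the refinement module above suggests: among per-frame detections, keep the box most consistent with the previous frame's localization. The IoU threshold and fallback rule are assumptions, not the paper's module.

```python
def iou(a, b):
    """Intersection-over-union of two boxes given as (x1, y1, x2, y2)."""
    ix1, iy1 = max(a[0], b[0]), max(a[1], b[1])
    ix2, iy2 = min(a[2], b[2]), min(a[3], b[3])
    inter = max(0.0, ix2 - ix1) * max(0.0, iy2 - iy1)
    area = lambda r: (r[2] - r[0]) * (r[3] - r[1])
    union = area(a) + area(b) - inter
    return inter / union if union > 0 else 0.0

def refine(detections, prev_box, min_iou=0.3):
    """Pick the detection most consistent with the previous frame's box;
    fall back to the previous box when nothing overlaps enough (a simple
    spatiotemporal heuristic for suppressing false detections)."""
    best = max(detections, key=lambda d: iou(d, prev_box), default=None)
    if best is None or iou(best, prev_box) < min_iou:
        return prev_box
    return best

prev = (100, 120, 180, 300)
dets = [(400, 40, 460, 90), (105, 118, 185, 305)]  # false positive + true hit
print(refine(dets, prev))  # -> (105, 118, 185, 305)
```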
arXiv:2404.07551 [eess.IV, cs.CV]
Event-Enhanced Snapshot Compressive Videography at 10K FPS
Authors: Bo Zhang, Jinli Suo, Qionghai Dai
Abstract: Video snapshot compressive imaging (SCI) encodes the target dynamic scene compactly into a snapshot and reconstructs its high-speed frame sequence afterward, greatly reducing the required data footprint and transmission bandwidth as well as enabling high-speed imaging with a low frame rate intensity camera. In implementation, high-speed dynamics are encoded via temporally varying patterns, and only frames at the corresponding temporal intervals can be reconstructed, while the dynamics occurring between consecutive frames are lost. To unlock the potential of conventional snapshot compressive videography, we propose a novel hybrid "intensity+event" imaging scheme by incorporating an event camera into a video SCI setup. Our proposed system consists of a dual-path optical setup that records the coded intensity measurement and intermediate event signals simultaneously, which is compact and photon-efficient, collecting the half of the photons discarded in conventional video SCI. Correspondingly, we developed a dual-branch Transformer utilizing the reciprocal relationship between the two data modes to decode dense video frames. Extensive experiments on both simulated and real-captured data demonstrate our superiority to state-of-the-art video SCI and video frame interpolation (VFI) methods. Benefiting from the new hybrid design leveraging both the intrinsic redundancy in videos and the unique features of event cameras, we achieve high-quality videography at 0.1 ms time intervals with a low-cost CMOS image sensor working at 24 FPS.
Submitted 11 April, 2024; originally announced April 2024.
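The video SCI measurement model this entry builds on can be written as $y = \sum_t C_t \odot x_t$: each high-speed frame $x_t$ is modulated by a time-varying coding pattern $C_t$ and the coded exposures sum on the sensor into one snapshot $y$. A minimal simulation of the forward model, with random binary masks assumed for illustration:

```python
import numpy as np

rng = np.random.default_rng(0)
T, H, W = 8, 64, 64                      # 8 high-speed frames per snapshot
frames = rng.random((T, H, W))           # latent high-speed scene x_t
masks = rng.integers(0, 2, size=(T, H, W)).astype(float)  # coding patterns C_t

# Snapshot measurement: per-frame coded exposures summed on the sensor.
snapshot = (masks * frames).sum(axis=0)  # y = sum_t C_t * x_t, elementwise
print(snapshot.shape)                    # (64, 64)
```

Reconstruction inverts this highly underdetermined sum; the hybrid scheme above adds event signals to recover the inter-frame dynamics the masks alone cannot capture.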
arXiv:2404.07188 [cs.DC, cs.CV, eess.IV]
GCV-Turbo: End-to-end Acceleration of GNN-based Computer Vision Tasks on FPGA
Authors: Bingyi Zhang, Rajgopal Kannan, Carl Busart, Viktor Prasanna
Abstract: Graph neural networks (GNNs) have recently empowered various novel computer vision (CV) tasks. In GNN-based CV tasks, a combination of CNN layers and GNN layers, or only GNN layers, are employed. This paper introduces GCV-Turbo, a domain-specific accelerator on FPGA for end-to-end acceleration of GNN-based CV tasks. GCV-Turbo consists of two key components: (1) a novel hardware architecture optimized for the computation kernels in both CNNs and GNNs using the same set of computation resources; (2) a PyTorch-compatible compiler that takes a user-defined model as input, performs end-to-end optimization for the computation graph of a given GNN-based CV task, and produces optimized code for hardware execution. The hardware architecture and the compiler work synergistically to support a variety of GNN-based CV tasks. We implement GCV-Turbo on a state-of-the-art FPGA and evaluate its performance across six representative GNN-based CV tasks with diverse input data modalities (e.g., image, human skeleton, point cloud). Compared with state-of-the-art CPU (GPU) implementations, GCV-Turbo achieves an average latency reduction of $68.4\times$ ($4.1\times$) on these six GNN-based CV tasks. Moreover, GCV-Turbo supports the execution of standalone CNNs or GNNs, achieving performance comparable to that of state-of-the-art CNN (GNN) accelerators for widely used CNN-only (GNN-only) models.
Submitted 10 April, 2024; originally announced April 2024.

arXiv:2403.17598 [eess.SY]
Receiver Resonant Frequency Adaptive Tracking in Wireless Power Transfer Systems Using Primary Variable Capacitor
Authors: Chang Liu, Wei Han, Guangyu Yan, Bowang Zhang, Chunlin Li
Abstract: Parameter variations within the resonant network of wireless power transfer (WPT) systems can cause drift in the resonant frequency, leading to a detuned system that requires higher power capacity and experiences reduced transfer efficiency. To address this issue, this paper presents an adaptive online receiver resonant frequency tracking scheme based solely on primary-side detection. The proposed method effectively compensates for parameter fluctuations in both primary and secondary resonators. The core of this approach is a switch-controlled capacitor (SCC) with a control angle calibrated during a system self-check process prior to high-power charging. Additionally, a two-step perturb-and-observe algorithm has been developed to perform online tracking while minimizing disturbances to the output power. Post-tracking, zero-voltage switching (ZVS) conditions can be achieved within a specified detuning range. To validate the efficacy of the proposed system, a 200 W experimental platform was constructed. The measured results demonstrate that resonance is consistently maintained within the 79-90 kHz frequency range, as specified by the SAE J2954 standard. The maximum frequency tracking error and efficiency increase are 0.7 kHz and 9%, respectively. Notably, the tracking process is completed in less than 1 ms.
Submitted 12 September, 2024; v1 submitted 26 March, 2024; originally announced March 2024.
Comments: 11 pages, 16 figures
arXiv:2403.15616 [cs.GT, cs.MA, eess.SY]
Balancing Fairness and Efficiency in Energy Resource Allocations
Authors: Jiayi Li, Matthew Motoki, Baosen Zhang
Abstract: Bringing fairness to energy resource allocation remains a challenge, due to the complexity of system structures and economic interdependencies among users and system operators' decision-making. The rise of distributed energy resources has introduced more diverse heterogeneous user groups, surpassing the capabilities of traditional efficiency-oriented allocation schemes. Without explicitly bringing fairness to the user-system interaction, this disparity often leads to disproportionate payments for certain user groups due to their utility formats or group sizes. Our paper addresses this challenge by formalizing the problem of fair energy resource allocation and introducing a framework for aggregators. This framework enables optimal fairness-efficiency trade-offs by selecting appropriate objectives in a principled way. By jointly optimizing over the total resources to allocate and the individual allocations, our approach reveals optimized allocation schemes that lie on the Pareto front, balancing fairness and efficiency in resource allocation strategies.
Submitted 22 March, 2024; originally announced March 2024.
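As background on the fairness-efficiency dial such a framework navigates, the α-fair welfare family is the textbook instrument: small α favors raw efficiency, α = 1 gives proportional fairness, and large α approaches max-min fairness. The closed-form sketch below (weights and budget hypothetical) shows allocations shifting along this dial; it is generic background, not necessarily the paper's objective.

```python
import numpy as np

def alpha_fair_allocation(weights, budget, alpha):
    """Maximizer of sum_i w_i * x_i^(1-alpha) / (1-alpha) subject to
    sum_i x_i = budget (alpha > 0, alpha != 1). Stationarity gives
    w_i * x_i^(-alpha) = const, i.e., x_i proportional to w_i^(1/alpha)."""
    w = np.asarray(weights, dtype=float)
    share = w ** (1.0 / alpha)
    return budget * share / share.sum()

w, budget = [4.0, 2.0, 1.0], 12.0   # hypothetical heterogeneous user groups
for a in (0.5, 1.0, 2.0, 10.0):
    if a == 1.0:
        # alpha = 1 is proportional fairness: maximize sum_i w_i log x_i,
        # whose optimum allocates in proportion to the weights.
        x = np.asarray(w) / sum(w) * budget
    else:
        x = alpha_fair_allocation(w, budget, a)
    print(f"alpha={a:>4}: {np.round(x, 2)}")
```

Running it shows the allocation moving from weight-dominated (efficient) toward nearly equal (max-min) as α grows, which is exactly the Pareto-front traversal the abstract describes.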
arXiv:2403.14508 [cs.LG, cs.AI, eess.SY]
Constrained Reinforcement Learning with Smoothed Log Barrier Function
Authors: Baohe Zhang, Yuan Zhang, Lilli Frison, Thomas Brox, Joschka Bödecker
Abstract: Reinforcement Learning (RL) has been widely applied to many control tasks and has substantially improved performance compared to conventional control methods in many domains where the reward function is well defined. However, for many real-world problems, it is often more convenient to formulate optimization problems in terms of rewards and constraints simultaneously. Optimizing such constrained problems via reward shaping can be difficult, as it requires tedious manual tuning of reward functions with several interacting terms. Recent formulations which include constraints mostly require a pre-training phase, which often needs human expertise to collect data, or assume having a sub-optimal policy readily available. We propose a new constrained RL method called CSAC-LB (Constrained Soft Actor-Critic with Log Barrier Function), which achieves competitive performance without any pre-training by applying a linear smoothed log barrier function to an additional safety critic. It implements an adaptive penalty for policy learning and alleviates the numerical issues that are known to complicate the application of the log barrier function method. As a result, we show that with CSAC-LB, we achieve state-of-the-art performance on several constrained control tasks with different levels of difficulty, and we evaluate our method in a locomotion task on a real quadruped robot platform.
Submitted 21 March, 2024; originally announced March 2024.
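A reconstruction of the "linear smoothed log barrier" idea from the general log-barrier-extension literature (the paper's exact constants may differ): inside the feasible region the classic barrier applies, and past a threshold it continues as its tangent line, so the penalty and its gradient stay finite even for constraint-violating states.

```python
import math

def smoothed_log_barrier(z: float, t: float = 5.0) -> float:
    """Linearly smoothed log barrier psi_t(z) for a constraint z <= 0.
    For z <= -1/t^2 this is the classic barrier -(1/t)*log(-z); beyond
    that point it continues as the matching tangent line (slope t), so it
    is finite and differentiable everywhere. (Reconstruction of the
    standard log-barrier extension, not necessarily the paper's form.)"""
    thresh = -1.0 / t**2
    if z <= thresh:
        return -math.log(-z) / t
    # First-order extension at z = -1/t^2: same value and slope there.
    return t * z - math.log(1.0 / t**2) / t + 1.0 / t

for z in (-1.0, -0.1, -0.04, 0.0, 0.5):
    print(f"z={z:+.2f}  psi={smoothed_log_barrier(z):+.3f}")
```

The finite gradient for z > 0 is what lets the penalty act on a safety critic during ordinary off-policy updates, avoiding the infinities that make the plain log barrier numerically fragile.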
arXiv:2403.05989 [cs.SD, eess.AS]
HAM-TTS: Hierarchical Acoustic Modeling for Token-Based Zero-Shot Text-to-Speech with Model and Data Scaling
Authors: Chunhui Wang, Chang Zeng, Bowen Zhang, Ziyang Ma, Yefan Zhu, Zifeng Cai, Jian Zhao, Zhonglin Jiang, Yong Chen
Abstract: Token-based text-to-speech (TTS) models have emerged as a promising avenue for generating natural and realistic speech, yet they grapple with low pronunciation accuracy, speaking style and timbre inconsistency, and a substantial need for diverse training data. In response, we introduce a novel hierarchical acoustic modeling approach complemented by a tailored data augmentation strategy and train it on a combination of real and synthetic data, scaling the data size up to 650k hours and leading to a zero-shot TTS model with 0.8B parameters. Specifically, our method incorporates into the TTS model, via a predictor, a latent variable sequence containing supplementary acoustic information based on refined self-supervised learning (SSL) discrete units. This significantly mitigates pronunciation errors and style mutations in synthesized speech. During training, we strategically replace and duplicate segments of the data to enhance timbre uniformity. Moreover, a pretrained few-shot voice conversion model is utilized to generate a plethora of voices with identical content yet varied timbres. This facilitates the explicit learning of utterance-level one-to-many mappings, enriching speech diversity while also ensuring consistency in timbre. Comparative experiments (demo page: https://anonymous.4open.science/w/ham-tts/) demonstrate our model's superiority over VALL-E in pronunciation precision and maintaining speaking style, as well as timbre continuity.
Submitted 9 March, 2024; originally announced March 2024.
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2402.16765">arXiv:2402.16765</a> <span> [<a href="https://arxiv.org/pdf/2402.16765">pdf</a>, <a href="https://arxiv.org/format/2402.16765">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> Oscillations-Aware Frequency Security Assessment via Efficient Worst-Case Frequency Nadir Computation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Jiang%2C+Y">Yan Jiang</a>, <a href="/search/eess?searchtype=author&query=Min%2C+H">Hancheng Min</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+B">Baosen Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2402.16765v1-abstract-short" style="display: inline;"> Frequency security assessment following major disturbances has long been one of the central tasks in power system operations. The standard approach is to study the center of inertia frequency, an aggregate signal for an entire system, to avoid analyzing the frequency signal at individual buses. However, as the amount of low-inertia renewable resources in a grid increases, the center of inertia fre… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.16765v1-abstract-full').style.display = 'inline'; document.getElementById('2402.16765v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2402.16765v1-abstract-full" style="display: none;"> Frequency security assessment following major disturbances has long been one of the central tasks in power system operations. The standard approach is to study the center of inertia frequency, an aggregate signal for an entire system, to avoid analyzing the frequency signal at individual buses. However, as the amount of low-inertia renewable resources in a grid increases, the center of inertia frequency is becoming too coarse to provide reliable frequency security assessment. In this paper, we propose an efficient algorithm to determine the worst-case frequency nadir across all buses for bounded power disturbances, as well as identify the power disturbances leading to that severest scenario. The proposed algorithm allows oscillations-aware frequency security assessment without conducting exhaustive simulations and intractable analysis. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.16765v1-abstract-full').style.display = 'none'; document.getElementById('2402.16765v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 February, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2024. 
arXiv:2402.15335 [pdf, other] eess.IV cs.CV cs.LG
Low-Rank Representations Meets Deep Unfolding: A Generalized and Interpretable Network for Hyperspectral Anomaly Detection
Authors: Chenyu Li, Bing Zhang, Danfeng Hong, Jing Yao, Jocelyn Chanussot
Abstract: Current hyperspectral anomaly detection (HAD) benchmark datasets suffer from low resolution, simple backgrounds, and the small size of the detection data. These factors also limit the performance of the well-known low-rank representation (LRR) models, both in the robustness of background-target separation and in their reliance on manual parameter selection. To this end, we build a new set of HAD benchmark datasets for improving the robustness of HAD algorithms in complex scenarios, AIR-HAD for short. Accordingly, we propose a generalized and interpretable HAD network by deeply unfolding a dictionary-learnable LRR model, named LRR-Net$^+$, which is capable of spectrally decoupling the background structure and object properties in a more generalized fashion while concurrently eliminating the bias introduced by vital interference targets. In addition, LRR-Net$^+$ integrates the solution process of the Alternating Direction Method of Multipliers (ADMM) optimizer with the deep network, guiding its search process and imparting a level of interpretability to the parameter optimization. The integration of physical models with deep learning techniques also eliminates the need for manual parameter tuning.
The manually tuned parameters are seamlessly transformed into trainable parameters of the deep network, facilitating a more efficient and automated optimization process. Extensive experiments conducted on the AIR-HAD dataset show the superiority of our LRR-Net$^+$ in terms of detection performance and generalization ability compared to top-performing rivals. Furthermore, the code and our AIR-HAD benchmark datasets will be made freely and openly available at https://sites.google.com/view/danfeng-hong.
Submitted 23 February, 2024; originally announced February 2024.
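Deep unfolding, the general device behind LRR-Net$^+$, turns each iteration of an optimizer into a network layer whose hand-tuned constants become trainable. The paper's own layers are not reproduced here; a generic single unfolded soft-thresholding step, the proximal operation inside ADMM/ISTA-style solvers, with a learnable threshold:

    import torch
    import torch.nn as nn

    class UnfoldedSoftThreshold(nn.Module):
        # One generic unfolded iteration: the regularization weight that would
        # be tuned by hand in ADMM becomes a learnable parameter.
        def __init__(self):
            super().__init__()
            self.theta = nn.Parameter(torch.tensor(0.1))

        def forward(self, x):
            return torch.sign(x) * torch.relu(torch.abs(x) - self.theta)

    layer = UnfoldedSoftThreshold()
    print(layer(torch.tensor([-0.5, 0.05, 0.3])))  # [-0.4, 0.0, 0.2]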
arXiv:2402.01194 [pdf, other] eess.SP
A Robust Super-resolution Gridless Imaging Framework for UAV-borne SAR Tomography
Authors: Silin Gao, Wenlong Wang, Muhan Wang, Zhe Zhang, Zai Yang, Xiaolan Qiu, Bingchen Zhang, Yirong Wu
Abstract: Synthetic aperture radar (SAR) tomography (TomoSAR) retrieves three-dimensional (3-D) information from multiple SAR images, effectively addresses the layover problem, and has become pivotal in urban mapping. The unmanned aerial vehicle (UAV) has gained popularity as a TomoSAR platform, offering distinct advantages such as the ability to achieve 3-D imaging in a single flight, cost-effectiveness, rapid deployment, and flexible trajectory planning. The evolution of compressed sensing (CS) has led to the widespread adoption of sparse reconstruction techniques in TomoSAR signal processing, with a focus on $\ell_1$ norm regularization and other grid-based CS methods. However, the discretization of the illuminated scene along elevation introduces modeling errors, resulting in reduced reconstruction accuracy, known as the "off-grid" effect. Recent advancements have introduced gridless CS algorithms to mitigate this issue. This paper presents an innovative gridless 3-D imaging framework tailored for UAV-borne TomoSAR. Capitalizing on the pulse repetition frequency (PRF) redundancy inherent in slow UAV platforms, a multiple measurement vectors (MMV) model is constructed to enhance noise immunity without compromising azimuth-range resolution. Given the sparsely placed array elements imposed by mounting-platform constraints, an atomic norm soft thresholding algorithm is proposed for the partially observed MMV, offering gridless reconstruction capability and super-resolution. An efficient alternating optimization algorithm is also employed to enhance computational efficiency. Validation of the proposed framework is achieved through computer simulations and flight experiments, affirming its efficacy in UAV-borne TomoSAR applications.
Submitted 2 February, 2024; originally announced February 2024.
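The "off-grid" effect can be reproduced in a toy elevation model: a scatterer lying between grid points matches no column of a gridded steering matrix exactly, which is the modeling error gridless methods avoid. All values below (wavelength, baselines, range, grid) are illustrative, not the paper's geometry:

    import numpy as np

    wavelength, slant_range = 0.03, 1000.0          # metres, illustrative
    baselines = np.arange(8) * 2.0                  # sparse elevation aperture

    def steering(s):                                # elevation position s (m)
        return np.exp(-4j * np.pi * baselines * s / (wavelength * slant_range))

    grid = np.linspace(0.0, 5.0, 11)                # 0.5 m discretization
    s_true = 2.25                                   # falls between grid points
    A = np.stack([steering(s) for s in grid], axis=1)
    corr = np.abs(A.conj().T @ steering(s_true)) / len(baselines)
    print(grid[np.argmax(corr)], round(corr.max(), 3))  # best atom correlates < 1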
arXiv:2401.10242 [pdf, other] cs.OH cs.GR cs.HC cs.SD eess.AS
DanceMeld: Unraveling Dance Phrases with Hierarchical Latent Codes for Music-to-Dance Synthesis
Authors: Xin Gao, Li Hu, Peng Zhang, Bang Zhang, Liefeng Bo
Abstract: In the realm of 3D digital human applications, music-to-dance presents a challenging task. Given the one-to-many relationship between music and dance, previous methods have been limited in their approach, relying solely on matching and generating corresponding dance movements based on music rhythm. In the professional field of choreography, a dance phrase consists of several dance poses and dance movements: dance poses are composed of a series of basic, meaningful body postures, while dance movements reflect dynamic changes such as the rhythm, melody, and style of dance. Taking inspiration from these concepts, we introduce an innovative dance generation pipeline called DanceMeld, which comprises two stages, i.e., the dance decoupling stage and the dance generation stage. In the decoupling stage, a hierarchical VQ-VAE is used to disentangle dance poses and dance movements at different feature-space levels, where the bottom code represents dance poses and the top code represents dance movements. In the generation stage, we utilize a diffusion model as a prior to model the distribution and generate latent codes conditioned on music features.
We experimentally demonstrate the representational capabilities of the top and bottom codes, enabling the explicit, decoupled expression of dance poses and dance movements. This disentanglement not only provides control over motion details, styles, and rhythm but also facilitates applications such as dance style transfer and dance unit editing. Our approach has undergone qualitative and quantitative experiments on the AIST++ dataset, demonstrating its superiority over other methods.
Submitted 30 November, 2023; originally announced January 2024.
Comments: 10 pages, 8 figures
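Both levels of the hierarchical VQ-VAE rest on the same primitive, nearest-neighbour codebook lookup; a minimal sketch with hypothetical feature and codebook shapes (the paper's encoder, decoder, and training losses are omitted):

    import numpy as np

    def quantize(z, codebook):
        # Map each latent vector to its nearest codebook entry; the resulting
        # indices are the discrete codes (bottom: poses, top: movements).
        d = ((z[:, None, :] - codebook[None, :, :]) ** 2).sum(-1)
        idx = d.argmin(axis=1)
        return idx, codebook[idx]

    rng = np.random.default_rng(0)
    z_bottom = rng.normal(size=(16, 4))     # hypothetical frame-level latents
    cb_bottom = rng.normal(size=(32, 4))    # hypothetical bottom codebook
    idx, z_q = quantize(z_bottom, cb_bottom)
    print(idx[:8])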
arXiv:2401.08049 [pdf, other] cs.CV cs.SD eess.AS
EmoTalker: Emotionally Editable Talking Face Generation via Diffusion Model
Authors: Bingyuan Zhang, Xulong Zhang, Ning Cheng, Jun Yu, Jing Xiao, Jianzong Wang
Abstract: In recent years, the field of talking face generation has attracted considerable attention, with certain methods adept at generating virtual faces that convincingly imitate human expressions. However, existing methods face challenges related to limited generalization, particularly when dealing with challenging identities. Furthermore, methods for editing expressions are often confined to a single emotion, failing to adapt to intricate emotions. To overcome these challenges, this paper proposes EmoTalker, an emotionally editable portrait animation approach based on a diffusion model. EmoTalker modifies the denoising process to ensure preservation of the original portrait's identity during inference. To enhance emotion comprehension from text input, an Emotion Intensity Block is introduced to analyze the fine-grained emotions and strengths derived from prompts. Additionally, a crafted dataset is harnessed to enhance emotion comprehension within prompts. Experiments show the effectiveness of EmoTalker in generating high-quality, emotionally customizable facial expressions.
Submitted 15 January, 2024; originally announced January 2024.
Comments: Accepted by the 2024 IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP 2024)
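EmoTalker's identity-preserving modification is its own contribution, but the denoising process it modifies is the standard diffusion reverse update, $x_{t-1} = \frac{1}{\sqrt{\alpha_t}}\left(x_t - \frac{1-\alpha_t}{\sqrt{1-\bar\alpha_t}}\,\hat\epsilon\right) + \sigma_t z$. A generic single step for reference, with illustrative coefficients:

    import numpy as np

    def ddpm_reverse_step(x_t, eps_hat, alpha_t, alpha_bar_t, sigma_t, rng):
        # One standard DDPM reverse step; the paper alters this process so the
        # portrait's identity survives inference (details in the paper).
        mean = (x_t - (1 - alpha_t) / np.sqrt(1 - alpha_bar_t) * eps_hat) \
               / np.sqrt(alpha_t)
        return mean + sigma_t * rng.normal(size=x_t.shape)

    rng = np.random.default_rng(0)
    x = rng.normal(size=4)
    print(ddpm_reverse_step(x, np.zeros(4), 0.99, 0.5, 0.01, rng))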
arXiv:2401.03473 [pdf, ps, other] cs.SD cs.AI eess.AS
ICMC-ASR: The ICASSP 2024 In-Car Multi-Channel Automatic Speech Recognition Challenge
Authors: He Wang, Pengcheng Guo, Yue Li, Ao Zhang, Jiayao Sun, Lei Xie, Wei Chen, Pan Zhou, Hui Bu, Xin Xu, Binbin Zhang, Zhuo Chen, Jian Wu, Longbiao Wang, Eng Siong Chng, Sun Li
Abstract: To promote speech processing and recognition research in driving scenarios, we build on the success of the Intelligent Cockpit Speech Recognition Challenge (ICSRC) held at ISCSLP 2022 and launch the ICASSP 2024 In-Car Multi-Channel Automatic Speech Recognition (ICMC-ASR) Challenge.
This challenge collects over 100 hours of multi-channel speech data recorded inside a new energy vehicle and 40 hours of noise for data augmentation. Two tracks, automatic speech recognition (ASR) and automatic speech diarization and recognition (ASDR), are set up, using character error rate (CER) and concatenated minimum permutation character error rate (cpCER) as evaluation metrics, respectively. Overall, the ICMC-ASR Challenge attracts 98 participating teams and receives 53 valid results in both tracks. In the end, first-place team USTCiflytek achieves a CER of 13.16% in the ASR track and a cpCER of 21.48% in the ASDR track, showing an absolute improvement of 13.08% and 51.4%, respectively, compared to our challenge baseline.
Submitted 20 February, 2024; v1 submitted 7 January, 2024; originally announced January 2024.
Comments: Accepted at ICASSP 2024
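The ASR track's metric, character error rate, is the character-level edit distance between hypothesis and reference divided by the reference length; a minimal self-contained sketch:

    def cer(ref, hyp):
        # Levenshtein distance (single-row DP) normalized by reference length.
        m, n = len(ref), len(hyp)
        d = list(range(n + 1))
        for i in range(1, m + 1):
            prev, d[0] = d[0], i
            for j in range(1, n + 1):
                cur = d[j]
                d[j] = min(d[j] + 1,                          # deletion
                           d[j - 1] + 1,                      # insertion
                           prev + (ref[i - 1] != hyp[j - 1])) # substitution
                prev = cur
        return d[n] / m

    print(cer("in car speech", "in cab speech"))  # 1 substitution -> ~0.077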
arXiv:2401.02687 [pdf, other] cs.CV cs.LG eess.IV
PAHD: Perception-Action based Human Decision Making using Explainable Graph Neural Networks on SAR Images
Authors: Sasindu Wijeratne, Bingyi Zhang, Rajgopal Kannan, Viktor Prasanna, Carl Busart
Abstract: Synthetic Aperture Radar (SAR) images are commonly utilized in military applications for automatic target recognition (ATR). Machine learning (ML) methods, such as Convolutional Neural Networks (CNNs) and Graph Neural Networks (GNNs), are frequently used to identify ground-based objects, including battle tanks, personnel carriers, and missile launchers. Determining the vehicle class, such as the BRDM2, BMP2, BTR60, or BTR70, is crucial, as it can help determine whether the target object is an ally or an enemy. While the ML algorithm provides feedback on the recognized target, the final decision is left to the commanding officers. Therefore, providing detailed information alongside the identified target can significantly influence their actions. This detailed information includes the SAR image features that contributed to the classification, the classification confidence, and the probability of the identified object belonging to a different object type or class.
We propose a GNN-based ATR framework that outputs the final classified class along with the detailed information mentioned above. This is the first study to provide such a detailed analysis of the classification, making final decisions more straightforward. Moreover, our GNN framework achieves an overall accuracy of 99.2% when evaluated on the MSTAR dataset, improving over previous state-of-the-art GNN methods.
Submitted 5 January, 2024; originally announced January 2024.
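The per-target detail the framework surfaces to an operator, the predicted class, its confidence, and the probabilities of the alternatives, can be read off a classifier's softmax; a sketch with hypothetical logits over the vehicle classes named above:

    import numpy as np

    def classification_report(logits, classes, top_k=3):
        # Softmax over logits, returning the top-k classes with probabilities.
        p = np.exp(logits - logits.max())
        p /= p.sum()
        order = np.argsort(p)[::-1][:top_k]
        return [(classes[i], round(float(p[i]), 3)) for i in order]

    classes = ["BRDM2", "BMP2", "BTR60", "BTR70"]
    print(classification_report(np.array([2.9, 1.2, 0.4, -0.3]), classes))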
arXiv:2312.15863 [pdf, other] cs.LG cs.AI cs.RO eess.SY
PDiT: Interleaving Perception and Decision-making Transformers for Deep Reinforcement Learning
Authors: Hangyu Mao, Rui Zhao, Ziyue Li, Zhiwei Xu, Hao Chen, Yiqun Chen, Bin Zhang, Zhen Xiao, Junge Zhang, Jiangjin Yin
Abstract: Designing better deep networks and better reinforcement learning (RL) algorithms are both important for deep RL. This work studies the former. Specifically, the Perception and Decision-making Interleaving Transformer (PDiT) network is proposed, which cascades two Transformers in a very natural way: the perceiving one focuses on the environmental perception by processing the observation at the patch level, whereas the deciding one attends to the decision-making by conditioning on the history of the desired returns, the perceiver's outputs, and the actions. Such a network design is generally applicable to many deep RL settings, e.g., both online and offline RL algorithms under environments with image observations, proprioception observations, or hybrid image-language observations. Extensive experiments show that PDiT not only achieves superior performance over strong baselines in different settings but also extracts explainable feature representations. Our code is available at https://github.com/maohangyu/PDiT.
Submitted 25 December, 2023; originally announced December 2023.
Comments: Proc. of the 23rd International Conference on Autonomous Agents and Multiagent Systems (AAMAS 2024, full paper with oral presentation).
Covers our preliminary study: arXiv:2212.14538
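A minimal reading of the cascaded design, with a perceiving encoder over observation patches feeding a deciding encoder over history tokens; depths, widths, and the mean-pooling into a single observation token are placeholders, not the published architecture:

    import torch
    import torch.nn as nn

    class PDiTBlockSketch(nn.Module):
        def __init__(self, d=64):
            super().__init__()
            make = lambda: nn.TransformerEncoderLayer(d_model=d, nhead=4,
                                                      batch_first=True)
            self.perceive = nn.TransformerEncoder(make(), num_layers=1)
            self.decide = nn.TransformerEncoder(make(), num_layers=1)

        def forward(self, patches, history):
            # Perceive observation patches, pool to one token, then decide
            # over the (return, observation, action) history plus that token.
            obs_token = self.perceive(patches).mean(dim=1, keepdim=True)
            return self.decide(torch.cat([history, obs_token], dim=1))

    block = PDiTBlockSketch()
    print(block(torch.randn(2, 16, 64), torch.randn(2, 8, 64)).shape)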
arXiv:2312.13752 [pdf] eess.IV cs.AI cs.CV doi:10.1016/j.media.2024.103253
Hunting imaging biomarkers in pulmonary fibrosis: Benchmarks of the AIIB23 challenge
Authors: Yang Nan, Xiaodan Xing, Shiyi Wang, Zeyu Tang, Federico N Felder, Sheng Zhang, Roberta Eufrasia Ledda, Xiaoliu Ding, Ruiqi Yu, Weiping Liu, Feng Shi, Tianyang Sun, Zehong Cao, Minghui Zhang, Yun Gu, Hanxiao Zhang, Jian Gao, Pingyu Wang, Wen Tang, Pengxin Yu, Han Kang, Junqiang Chen, Xing Lu, Boyu Zhang, Michail Mamalakis, et al. (16 additional authors not shown)
Abstract: Airway-related quantitative imaging biomarkers are crucial for examination, diagnosis, and prognosis in pulmonary diseases. However, the manual delineation of airway trees remains prohibitively time-consuming. While significant efforts have been made towards enhancing airway modelling, current publicly available datasets concentrate on lung diseases with moderate morphological variations. The intricate honeycombing patterns present in the lung tissues of fibrotic lung disease patients exacerbate the challenges, often leading to various prediction errors. To address this issue, the 'Airway-Informed Quantitative CT Imaging Biomarker for Fibrotic Lung Disease 2023' (AIIB23) competition was organized in conjunction with the official 2023 International Conference on Medical Image Computing and Computer Assisted Intervention (MICCAI). The airway structures were meticulously annotated by three experienced radiologists. Competitors were encouraged to develop automatic airway segmentation models with high robustness and generalization abilities, followed by exploring the most correlated QIB of mortality prediction. A training set of 120 high-resolution computerised tomography (HRCT) scans was publicly released with expert annotations and mortality status. The online validation set incorporated 52 HRCT scans from patients with fibrotic lung disease, and the offline test set included 140 cases from fibrosis and COVID-19 patients. The results show that the capacity to extract airway trees from patients with fibrotic lung disease can be enhanced by introducing a voxel-wise weighted general union loss and a continuity loss. In addition to the competitive image biomarkers for prognosis, a strong airway-derived biomarker (hazard ratio > 1.5, p < 0.0001) was revealed for survival prognostication compared with existing clinical measurements, clinician assessment and AI-based biomarkers.
Submitted 16 April, 2024; v1 submitted 21 December, 2023; originally announced December 2023.
Comments: 19 pages
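The voxel-wise weighted general union loss is the challenge's own formulation; a generic weighted soft-overlap loss in the same spirit, where the weight map might upweight thin distal airways (a sketch under that assumption, not the published loss):

    import torch

    def weighted_soft_union_loss(pred, target, weight, eps=1e-6):
        # 1 - weighted soft-IoU between a probability map and a binary mask.
        inter = (weight * pred * target).sum()
        union = (weight * (pred + target - pred * target)).sum()
        return 1.0 - (inter + eps) / (union + eps)

    pred = torch.rand(1, 1, 8, 8, 8)                   # hypothetical probabilities
    target = (torch.rand(1, 1, 8, 8, 8) > 0.7).float() # hypothetical airway mask
    weight = 1.0 + 4.0 * target                        # hypothetical upweighting
    print(weighted_soft_union_loss(pred, target, weight).item())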
arXiv:2312.11930 [pdf, other] cs.RO eess.SY
InPTC: Integrated Planning and Tube-Following Control for Prescribed-Time Collision-Free Navigation of Wheeled Mobile Robots
Authors: Xiaodong Shao, Bin Zhang, Hui Zhi, Jose Guadalupe Romero, Bowen Fan, Qinglei Hu, David Navarro-Alarcon
Abstract: In this article, we propose a novel approach, called InPTC (Integrated Planning and Tube-Following Control), for prescribed-time collision-free navigation of wheeled mobile robots in a compact convex workspace cluttered with static, sufficiently separated, and convex obstacles. A path planner with prescribed-time convergence is presented based upon Bouligand's tangent cones and time scale transformation (TST) techniques, yielding a continuous vector field that can guide the robot from almost all initial positions in the free space to the designated goal at a prescribed time, while avoiding entering the obstacle regions augmented with a safety margin. By leveraging barrier functions and TST, we further derive a tube-following controller to achieve robot trajectory tracking within a prescribed time less than the planner's settling time. This controller ensures the robot moves inside a predefined "safe tube" around the reference trajectory, where the tube radius is set to be less than the safety margin.
Consequently, the robot will reach the goal location within the prescribed time while avoiding collision with any obstacles along the way. The proposed InPTC is implemented on a Mona robot operating in an arena cluttered with obstacles of various shapes. Experimental results demonstrate that InPTC not only generates smooth collision-free reference trajectories that converge to the goal location at the preassigned time of $250\,\rm s$ (i.e., the required task completion time), but also achieves tube-following trajectory tracking with tracking accuracy better than $0.01\,\rm m$ after the preassigned time of $150\,\rm s$. This enables the robot to accomplish the navigation task within the required time of $250\,\rm s$.
Submitted 27 August, 2024; v1 submitted 19 December, 2023; originally announced December 2023.
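The prescribed-time mechanism behind time scale transformation can be seen in a scalar toy system: a feedback gain that grows as $t$ approaches the prescribed time $T$ drives the state to zero by $T$ from any initial condition, here $\dot x = -\frac{k}{T-t}x$, giving $x(t) = x_0\,(1 - t/T)^k$. The paper's planner and controller are far richer than this sketch:

    # Euler simulation of xdot = -k/(T - t) * x, stopped 1 s short of the
    # singularity at t = T; analytically x(t) = x0 * (1 - t/T)**k.
    T, k, dt = 250.0, 2.0, 0.01      # 250 s matches the paper's task time
    x, t = 5.0, 0.0
    while t < T - 1.0:
        x += dt * (-k / (T - t)) * x
        t += dt
    print(x)                         # ~5 * (1/250)**2 = 8e-5, nearly converged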
arXiv:2311.13824 [pdf, other] cs.RO eess.SY
Constraint-Guided Online Data Selection for Scalable Data-Driven Safety Filters in Uncertain Robotic Systems
Authors: Jason J. Choi, Fernando Castañeda, Wonsuhk Jung, Bike Zhang, Claire J. Tomlin, Koushil Sreenath
Abstract: As the use of autonomous robots expands to tasks that are complex and challenging to model, the demand for robust data-driven control methods that can certify safety and stability in uncertain conditions is increasing. However, the practical implementation of these methods often faces scalability issues due to the growing number of data points with system complexity, and a significant reliance on high-quality training data. In response to these challenges, this study presents a scalable data-driven controller that efficiently identifies and infers from the most informative data points for implementing data-driven safety filters. Our approach is grounded in the integration of a model-based certificate-function method and Gaussian process (GP) regression, reinforced by a novel online data selection algorithm that reduces time complexity from quadratic to linear relative to dataset size. Empirical evidence, gathered from successful real-world cart-pole swing-up experiments and simulated locomotion of a five-link bipedal robot, demonstrates the efficacy of our approach. Our findings reveal that our efficient online data selection algorithm, which strategically selects key data points, enhances the practicality and efficiency of data-driven certifying filters in complex robotic systems, significantly mitigating scalability concerns inherent in nonparametric learning-based control methods.
Submitted 27 September, 2024; v1 submitted 23 November, 2023; originally announced November 2023.
Comments: The first three authors contributed equally to the work.
This work has been submitted to the IEEE for possible publication.
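The paper's linear-time online selector is its own algorithm, but the underlying idea of inferring from the most informative points can be sketched with greedy posterior-variance selection under a Gaussian process, a simple quadratic-time stand-in rather than the authors' method:

    import numpy as np
    from sklearn.gaussian_process import GaussianProcessRegressor

    rng = np.random.default_rng(0)
    X = rng.uniform(-3, 3, size=(200, 1))
    y = np.sin(X).ravel() + 0.1 * rng.normal(size=200)

    selected = [0]
    for _ in range(9):               # grow a 10-point active set greedily
        gp = GaussianProcessRegressor().fit(X[selected], y[selected])
        _, std = gp.predict(X, return_std=True)
        selected.append(int(np.argmax(std)))   # most informative next point
    print(sorted(selected))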