Search | arXiv e-print repository
<a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1–50 of 805 results for author: <span class="mathjax">Zhang, H</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a> </span> </div> </div> <div class="content"> <form method="GET" action="/search/eess" aria-role="search"> Searching in archive <strong>eess</strong>. <a href="/search/?searchtype=author&query=Zhang%2C+H">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." 
type="text" value="Zhang, H"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Zhang%2C+H&terms-0-field=author&size=50&order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Zhang, H"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. 
</div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Zhang%2C+H&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Zhang%2C+H&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Zhang%2C+H&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Zhang%2C+H&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Zhang%2C+H&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Zhang%2C+H&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">…</span></li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.17420">arXiv:2411.17420</a> <span> [<a href="https://arxiv.org/pdf/2411.17420">pdf</a>, <a href="https://arxiv.org/format/2411.17420">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computational Engineering, Finance, and Science">cs.CE</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> Cross-modal Medical Image Generation Based on Pyramid Convolutional Attention Network </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Mao%2C+F">Fuyou Mao</a>, <a href="/search/eess?searchtype=author&query=Lin%2C+L">Lixin Lin</a>, <a href="/search/eess?searchtype=author&query=Jiang%2C+M">Ming Jiang</a>, <a href="/search/eess?searchtype=author&query=Dai%2C+D">Dong Dai</a>, <a href="/search/eess?searchtype=author&query=Yang%2C+C">Chao Yang</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+H">Hao Zhang</a>, <a href="/search/eess?searchtype=author&query=Tang%2C+Y">Yan Tang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.17420v1-abstract-short" style="display: inline;"> The integration of multimodal medical imaging can provide complementary and comprehensive information for the diagnosis of Alzheimer's disease (AD). However, in clinical practice, since positron emission tomography (PET) is often missing, multimodal images might be incomplete. 
To address this problem, we propose a method that can efficiently utilize structural magnetic resonance imaging (sMRI) ima… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.17420v1-abstract-full').style.display = 'inline'; document.getElementById('2411.17420v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.17420v1-abstract-full" style="display: none;"> The integration of multimodal medical imaging can provide complementary and comprehensive information for the diagnosis of Alzheimer's disease (AD). However, in clinical practice, since positron emission tomography (PET) is often missing, multimodal images might be incomplete. To address this problem, we propose a method that can efficiently utilize structural magnetic resonance imaging (sMRI) image information to generate high-quality PET images. Our generation model efficiently utilizes pyramid convolution combined with channel attention mechanism to extract multi-scale local features in sMRI, and injects global correlation information into these features using self-attention mechanism to ensure the restoration of the generated PET image on local texture and global structure. Additionally, we introduce additional loss functions to guide the generation model in producing higher-quality PET images. Through experiments conducted on publicly available ADNI databases, the generated images outperform previous research methods in various performance indicators (average absolute error: 0.0194, peak signal-to-noise ratio: 29.65, structural similarity: 0.9486) and are close to real images. In promoting AD diagnosis, the generated images combined with their corresponding sMRI also showed excellent performance in AD diagnosis tasks (classification accuracy: 94.21 %), and outperformed previous research methods of the same type. The experimental results demonstrate that our method outperforms other competing methods in quantitative metrics, qualitative visualization, and evaluation criteria. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.17420v1-abstract-full').style.display = 'none'; document.getElementById('2411.17420v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">18 pages, 6 figures, Machine Vision and Applications</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.14816">arXiv:2411.14816</a> <span> [<a href="https://arxiv.org/pdf/2411.14816">pdf</a>, <a href="https://arxiv.org/format/2411.14816">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> Unsupervised Multi-view UAV Image Geo-localization via Iterative Rendering </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Li%2C+H">Haoyuan Li</a>, <a href="/search/eess?searchtype=author&query=Xu%2C+C">Chang Xu</a>, <a href="/search/eess?searchtype=author&query=Yang%2C+W">Wen Yang</a>, <a href="/search/eess?searchtype=author&query=Mi%2C+L">Li Mi</a>, <a href="/search/eess?searchtype=author&query=Yu%2C+H">Huai Yu</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+H">Haijian Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.14816v1-abstract-short" style="display: inline;"> Unmanned Aerial Vehicle (UAV) Cross-View Geo-Localization (CVGL) presents significant challenges due to the view discrepancy between oblique UAV images and overhead satellite images. Existing methods heavily rely on the supervision of labeled datasets to extract viewpoint-invariant features for cross-view retrieval. However, these methods have expensive training costs and tend to overfit the regio… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14816v1-abstract-full').style.display = 'inline'; document.getElementById('2411.14816v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.14816v1-abstract-full" style="display: none;"> Unmanned Aerial Vehicle (UAV) Cross-View Geo-Localization (CVGL) presents significant challenges due to the view discrepancy between oblique UAV images and overhead satellite images. Existing methods heavily rely on the supervision of labeled datasets to extract viewpoint-invariant features for cross-view retrieval. However, these methods have expensive training costs and tend to overfit the region-specific cues, showing limited generalizability to new regions. To overcome this issue, we propose an unsupervised solution that lifts the scene representation to 3d space from UAV observations for satellite image generation, providing robust representation against view distortion. By generating orthogonal images that closely resemble satellite views, our method reduces view discrepancies in feature representation and mitigates shortcuts in region-specific image pairing. 

2. arXiv:2411.14816 [pdf, other] (cs.CV, cs.RO, eess.IV)
   Unsupervised Multi-view UAV Image Geo-localization via Iterative Rendering
   Authors: Haoyuan Li, Chang Xu, Wen Yang, Li Mi, Huai Yu, Haijian Zhang
   Abstract: Unmanned Aerial Vehicle (UAV) Cross-View Geo-Localization (CVGL) presents significant challenges due to the view discrepancy between oblique UAV images and overhead satellite images. Existing methods rely heavily on supervision from labeled datasets to extract viewpoint-invariant features for cross-view retrieval. However, these methods have expensive training costs and tend to overfit region-specific cues, showing limited generalizability to new regions. To overcome this issue, we propose an unsupervised solution that lifts the scene representation into 3D space from UAV observations for satellite image generation, providing a representation that is robust to view distortion. By generating orthogonal images that closely resemble satellite views, our method reduces view discrepancies in feature representation and mitigates shortcuts in region-specific image pairing. To further align the rendered image's perspective with the real one, we design an iterative camera pose updating mechanism that progressively modulates the rendered query image with potential satellite targets, eliminating spatial offsets relative to the reference images. This iterative refinement strategy also enhances cross-view feature invariance through view-consistent fusion across iterations. As such, our unsupervised paradigm naturally avoids region-specific overfitting, enabling generic CVGL for UAV images without feature fine-tuning or data-driven training. Experiments on the University-1652 and SUES-200 datasets demonstrate that our approach significantly improves geo-localization accuracy while maintaining robustness across diverse regions. Notably, without model fine-tuning or paired training, our method achieves performance competitive with recent supervised methods.
   Submitted 22 November, 2024; originally announced November 2024.
   Comments: 13 pages

3. arXiv:2411.14418 [pdf, other] (eess.IV, cs.CV) doi:10.1007/978-3-031-66955-2_5
   Multimodal 3D Brain Tumor Segmentation with Adversarial Training and Conditional Random Field
   Authors: Lan Jiang, Yuchao Zheng, Miao Yu, Haiqing Zhang, Fatemah Aladwani, Alessandro Perelli
   Abstract: Accurate brain tumor segmentation remains challenging due to the structural complexity and large individual variability of gliomas. Leveraging the detail-preserving ability of conditional random fields (CRFs) and the spatial feature extraction capacity of V-Net, we propose a multimodal 3D Volume Generative Adversarial Network (3D-vGAN) for precise segmentation. The model improves V-Net with Pseudo-3D convolutions, adds a conditional random field after the generator, and uses the original image as supplemental guidance. Results on the BraTS-2018 dataset show that 3D-vGAN outperforms classical segmentation models, including U-Net, GAN, FCN, and 3D V-Net, reaching specificity over 99.8%.
   Submitted 21 November, 2024; originally announced November 2024.
   Comments: 13 pages, 7 figures, Annual Conference on Medical Image Understanding and Analysis (MIUA) 2024
   MSC Class: 15-11; ACM Class: I.4.6; I.5.4
   Journal ref: Medical Image Understanding and Analysis (MIUA), Lecture Notes in Computer Science, Springer, vol. 14859, 2024

4. arXiv:2411.13602 [pdf] (eess.IV, cs.AI, cs.CV)
   Large-scale cross-modality pretrained model enhances cardiovascular state estimation and cardiomyopathy detection from electrocardiograms: An AI system development and multi-center validation study
   Authors: Zhengyao Ding, Yujian Hu, Youyao Xu, Chengchen Zhao, Ziyu Li, Yiheng Mao, Haitao Li, Qian Li, Jing Wang, Yue Chen, Mengjia Chen, Longbo Wang, Xuesen Chu, Weichao Pan, Ziyi Liu, Fei Wu, Hongkun Zhang, Ting Chen, Zhengxing Huang
   Abstract: Cardiovascular diseases (CVDs) present significant challenges for early and accurate diagnosis. While cardiac magnetic resonance imaging (CMR) is the gold standard for assessing cardiac function and diagnosing CVDs, its high cost and technical complexity limit accessibility. In contrast, electrocardiography (ECG) offers promise for large-scale early screening. This study introduces CardiacNets, an innovative model that enhances ECG analysis by leveraging the diagnostic strengths of CMR through cross-modal contrastive learning and generative pretraining. CardiacNets serves two primary functions: (1) it evaluates detailed cardiac function indicators and screens for potential CVDs, including coronary artery disease, cardiomyopathy, pericarditis, heart failure and pulmonary hypertension, using ECG input; and (2) it enhances interpretability by generating high-quality CMR images from ECG data. We train and validate the proposed CardiacNets on two large-scale public datasets (the UK Biobank with 41,519 individuals and MIMIC-IV-ECG comprising 501,172 samples) as well as three private datasets (FAHZU with 410 individuals, SAHZU with 464 individuals, and QPH with 338 individuals). The findings demonstrate that CardiacNets consistently outperforms traditional ECG-only models, substantially improving screening accuracy. Furthermore, the generated CMR images provide valuable diagnostic support for physicians of all experience levels. This proof-of-concept study highlights how ECG can facilitate cross-modal insights into cardiac function assessment, paving the way for enhanced CVD screening and diagnosis at a population level.
   Submitted 19 November, 2024; originally announced November 2024.
   Comments: 23 pages, 8 figures

5. arXiv:2411.13456 [pdf] (eess.SY)
   Why Anticipatory Sensing Matters in Commercial ACC Systems under Cut-In Scenarios: A Perspective from Stochastic Safety Analysis
   Authors: Hao Zhang, Sixu Li, Zihao Li, Mohammad Anis, Dominique Lord, Yang Zhou
   Abstract: This study presents an analytical solution for the vehicle state evolution of Adaptive Cruise Control (ACC) systems under cut-in scenarios, incorporating sensing delays and anticipation using the Lambert W function. The theoretical analysis demonstrates that the vehicle state evolution and the corresponding safety of ACC in cut-in situations are influenced by multiple factors, including the original leading vehicle's state, the initial conditions of the cut-in vehicle, subsequent cut-in maneuvers, sensing delays, and the ACC's anticipation capabilities. To quantitatively assess these influences, a series of numerical experiments was conducted to perform a stochastic safety analysis of ACC systems, accounting for embedded sensing delays and anticipation, using control parameters empirically calibrated from real-world data. The experiments revealed that the impact of sensing delays on ACC is multifaceted. Specifically, sensing delays negatively affect ACC stability, with the severity increasing as the delay lengthens. Furthermore, collision risk in cut-in scenarios becomes more significant with sensing delays, particularly when the cut-in vehicle is slower than the following vehicle and when cut-ins are aggressive. However, anticipation plays a crucial role in mitigating these risks. Even with a 0.6-second anticipation, collision risk can be reduced by 91% in highly adverse scenarios. Finally, the effects of both sensing delays and anticipation intensify with their duration. An anticipation period of 2 seconds effectively ensures safety in aggressive cut-in conditions, even in the presence of sensing delays.
   Submitted 20 November, 2024; originally announced November 2024.
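
   The Lambert W function mentioned in entry 5 is the standard tool for delay terms of the kind that appear in linear car-following models: for a scalar delayed feedback x'(t) = a*x(t - tau), the characteristic equation s = a*exp(-s*tau) has roots s_k = W_k(a*tau)/tau, and the real part of the principal-branch root gives the rightmost (stability-determining) root. The snippet below shows only that generic connection, with an illustrative gain and delay, not the paper's calibrated ACC model.

      import numpy as np
      from scipy.special import lambertw

      def characteristic_roots(a: float, tau: float, branches=(0, -1)):
          """Roots of s = a * exp(-s * tau), i.e. s = W_k(a * tau) / tau.

          Textbook Lambert-W treatment of a scalar delayed feedback term;
          'a' and 'tau' are an illustrative gain and sensing delay, not
          values calibrated from data.
          """
          return {k: lambertw(a * tau, k) / tau for k in branches}

      # Example: negative feedback gain with a 0.6 s delay.
      for k, s in characteristic_roots(a=-0.5, tau=0.6).items():
          print(f"branch {k}: s = {s:.4f}, decaying mode: {s.real < 0}")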

6. arXiv:2411.12985 [pdf, other] (eess.SP)
   Disco Intelligent Omni-Surfaces: 360-degree Fully-Passive Jamming Attacks
   Authors: Huan Huang, Hongliang Zhang, Jide Yuan, Luyao Sun, Yitian Wang, Weidong Mei, Boya Di, Yi Cai, Zhu Han
   Abstract: Intelligent omni-surfaces (IOSs) with 360-degree electromagnetic radiation significantly improve the performance of wireless systems, but an adversarial IOS also poses a significant potential risk for physical layer security. In this paper, we propose a "DISCO" IOS (DIOS) based fully-passive jammer (FPJ) that can launch omnidirectional fully-passive jamming attacks. In the proposed DIOS-based FPJ, the interrelated refractive and reflective (R&R) coefficients of the adversarial IOS are randomly generated, acting like a "DISCO" that distributes the wireless energy radiated by the base station. By introducing active channel aging (ACA) during the channel coherence time, the DIOS-based FPJ can perform omnidirectional fully-passive jamming with neither jamming power nor channel knowledge of the legitimate users (LUs). To characterize the impact of the DIOS-based FPJ, we derive the statistical characteristics of DIOS-jammed channels under two widely used IOS models, i.e., the constant-amplitude model and the variable-amplitude model. Based on the derived statistics, we give an asymptotic analysis of the ergodic achievable sum rate under DIOS-based omnidirectional fully-passive jamming for both IOS models. The analysis shows that the omnidirectional jamming impact of the proposed DIOS-based FPJ implemented by a constant-amplitude IOS depends on neither the quantization number nor the stochastic distribution of the DIOS coefficients, whereas this conclusion does not hold when a variable-amplitude IOS is used. Numerical results based on one-bit quantization of the IOS phase shifts verify the effectiveness of the theoretical analysis. The proposed DIOS-based FPJ can not only launch omnidirectional fully-passive jamming but also improve the jamming impact by about 55% at 10 dBm transmit power per LU.
   Submitted 19 November, 2024; originally announced November 2024.
   Comments: This paper has been submitted to IEEE TWC for possible publication

7. arXiv:2411.09868 [pdf] (cs.IT, eess.SP)
   Phase Transitions with Structured Sparsity
   Authors: Huiguang Zhang, Baoguo Liu
   Abstract: In the field of signal processing, phase transition phenomena have recently attracted great attention. Donoho's work established the signal recovery threshold using indicators such as the restricted isometry property (RIP) and incoherence, and proved that phase transition phenomena occur in compressed sampling. Those studies, however, mainly focused on simple sparse signals, and the phase transition behavior of structured sparse signals remains unclear. Signals with a specific structure, such as the block or tree structures common in real-world applications, are called structured sparse signals. The objectives of this article are to study the phase transition phenomenon of structured sparse signals and to investigate how the structure of sparse signals affects the phase transition threshold. It begins with a summary of the common subspace of structured sparse signals and the theory of high-dimensional convex polytope random projections. Next, after examining the weak and strong thresholds of structured sparse signals, the strong threshold expressions for block-structured and tree-structured sparse signals are derived.
   Submitted 14 November, 2024; originally announced November 2024.

8. arXiv:2411.04864 [pdf] (eess.SY)
   Voltage Support Capability Analysis of Grid-Forming Inverters with Current-Limiting Control Under Asymmetrical Grid Faults
   Authors: Han Zhang, Rui Liu, Yunwei Li
   Abstract: Voltage support capability is critical for grid-forming (GFM) inverters with current-limiting control (CLC) during grid faults. Despite existing findings on voltage support under symmetrical grid faults, their applicability to the more common but more complex asymmetrical grid faults has yet to be verified rigorously. This letter fills that gap by establishing and analyzing positive- and negative-sequence equivalent circuit models in which a virtual impedance emulates various CLCs. It is shown that matching the phase angle of the virtual impedance emulated by the CLC with that of the composite impedance from the capacitor to the fault location maximizes the voltage support capability of GFM inverters under asymmetrical grid faults. Rigorous theoretical analysis and experimental results verify this conclusion.
   Submitted 7 November, 2024; originally announced November 2024.

9. arXiv:2411.04611 [pdf, other] (eess.SP)
   Compressive Spectrum Sensing with 1-bit ADCs
   Authors: Jian Yang, Zihang Song, Han Zhang, Yue Gao
   Abstract: Efficient wideband spectrum sensing (WSS) is essential for managing spectrum scarcity in wireless communications. However, existing compressed sensing (CS)-based WSS methods require high sampling rates and power consumption, particularly with high-precision analog-to-digital converters (ADCs). Although 1-bit CS with low-precision ADCs can mitigate these demands, most approaches still depend on multi-user cooperation and prior sparsity information, which are often unavailable in WSS scenarios. This paper introduces a non-cooperative WSS method using multicoset sampling with 1-bit ADCs to achieve sub-Nyquist sampling without requiring sparsity knowledge. We analyze the impact of 1-bit quantization on multiband signals, then apply eigenvalue decomposition to isolate the signal subspace from the noise, enabling spectrum support estimation without signal reconstruction. This approach provides a power-efficient solution for WSS that eliminates the need for cooperation and prior information.
   Submitted 7 November, 2024; originally announced November 2024.
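
   Entry 9's support-estimation idea, recovering a low-dimensional signal subspace from the covariance of 1-bit quantized snapshots, can be illustrated with a small simulation: even after sign quantization, the dominant eigenvalues of the sample covariance should separate clearly from the noise floor, indicating how many sub-bands are occupied without reconstructing the signal. The toy model below (branch count, band frequencies, noise level, and snapshot count are all illustrative assumptions) is not the paper's multicoset system, only a sketch of the eigenvalue-based principle.

      import numpy as np

      rng = np.random.default_rng(0)

      M, K, N = 8, 2, 4000                # sampling branches, active bands, snapshots
      freqs = np.array([0.12, 0.37])      # normalized center frequencies of the active bands
      offsets = np.arange(M)              # per-branch sampling offsets (toy multicoset stand-in)

      A = np.exp(-2j * np.pi * np.outer(offsets, freqs))            # (M, K) steering matrix
      sym = (rng.standard_normal((K, N)) + 1j * rng.standard_normal((K, N))) / np.sqrt(2)
      noise = 0.1 * (rng.standard_normal((M, N)) + 1j * rng.standard_normal((M, N))) / np.sqrt(2)
      Y = A @ sym + noise                                            # unquantized snapshots

      Q = (np.sign(Y.real) + 1j * np.sign(Y.imag)) / np.sqrt(2)      # 1-bit ADC per branch
      R = (Q @ Q.conj().T) / N                                       # sample covariance of quantized data
      eigvals = np.sort(np.linalg.eigvalsh(R))[::-1]

      print("eigenvalues:", np.round(eigvals, 3))
      # The K = 2 dominant eigenvalues should stand well above the noise-floor
      # cluster, so the number of occupied bands is recoverable from the
      # quantized covariance alone.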

10. arXiv:2411.04494 [pdf, other] (cs.RO, eess.SY)
    Online Omnidirectional Jumping Trajectory Planning for Quadrupedal Robots on Uneven Terrains
    Authors: Linzhu Yue, Zhitao Song, Jinhu Dong, Zhongyu Li, Hongbo Zhang, Lingwei Zhang, Xuanqi Zeng, Koushil Sreenath, Yun-hui Liu
    Abstract: Natural terrain complexity often necessitates agile movements like jumping in animals to improve traversal efficiency. To enable similar capabilities in quadruped robots, complex real-time jumping maneuvers are required. Current research does not adequately address the problem of online omnidirectional jumping and neglects the robot's kinodynamic constraints during trajectory generation. This paper proposes a general and complete cascade online optimization framework for omnidirectional jumping for quadruped robots. Our solution systematically encompasses jumping trajectory generation, a trajectory tracking controller, and a landing controller. It also incorporates environmental perception to navigate obstacles that standard locomotion cannot bypass, such as jumping from high platforms. We introduce a novel jumping plane to parameterize omnidirectional jumping motion and formulate a tightly coupled optimization problem accounting for the kinodynamic constraints, simultaneously optimizing the CoM trajectory, Ground Reaction Forces (GRFs), and joint states. To meet the online requirements, we propose an accelerated evolutionary algorithm as the trajectory optimizer to address the complexity of kinodynamic constraints. To ensure stability and accuracy in environmental perception post-landing, we introduce a coarse-to-fine relocalization method that combines global Branch and Bound (BnB) search with Maximum a Posteriori (MAP) estimation for precise positioning during navigation and jumping. The proposed framework achieves jump trajectory generation in approximately 0.1 seconds with a warm start and has been successfully validated on two quadruped robots on uneven terrains. Additionally, we extend the framework's versatility to humanoid robots.
    Submitted 9 November, 2024; v1 submitted 7 November, 2024; originally announced November 2024.
    Comments: Submitted to IJRR

11. arXiv:2410.22799 [pdf, other] (eess.SP)
    RIS-Aided Dual-Polarized MIMO: How Large a Surface is Needed to Beat Single Polarization?
    Authors: Zizhou Zheng, Huan Huang, Hongliang Zhang, A. Lee Swindlehurst
    Abstract: Dual-polarized (DP) multiple-input-multiple-output (MIMO) systems have been widely adopted in commercial mobile wireless communications. Such systems achieve multiplexing and diversity gain by exploiting the polarization dimension. However, existing studies have shown that the capacity of DP MIMO may not surpass that of single-polarized (SP) MIMO systems due to the cross-polarization coupling induced by the propagation environment. In this letter, we employ reconfigurable intelligent surfaces (RISs) to address this issue and investigate how large the surface should be to ensure a better performance for DP MIMO. Specifically, we first derive the capacities of DP and SP MIMO systems with an RIS, and then study the influence of the RIS size on the system capacity. Our analyses reveal how to deploy the RIS in a DP MIMO scenario.
    Submitted 30 October, 2024; originally announced October 2024.
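
    The DP-versus-SP question in entry 11 comes down to a trade-off: co-located orthogonal polarizations decorrelate the channel, but cross-polarization coupling leaks transmit power into weaker cross-polar paths. The Monte Carlo sketch below compares the ergodic capacity of a 2x2 dual-polarized Rayleigh channel (cross-polar coupling power chi) against a spatially correlated single-polarized 2x2 channel (correlation rho); the channel models, chi, rho, and SNR are generic textbook assumptions, not the letter's RIS-assisted analysis.

      import numpy as np

      rng = np.random.default_rng(1)

      def capacity(H: np.ndarray, snr: float) -> float:
          """Equal-power MIMO capacity log2 det(I + (snr/Nt) H H^H), in bit/s/Hz."""
          nr, nt = H.shape
          return np.log2(np.linalg.det(np.eye(nr) + (snr / nt) * H @ H.conj().T).real)

      def rayleigh(shape):
          return (rng.standard_normal(shape) + 1j * rng.standard_normal(shape)) / np.sqrt(2)

      def ergodic(gen, snr_db=20.0, trials=5000):
          snr = 10 ** (snr_db / 10)
          return np.mean([capacity(gen(), snr) for _ in range(trials)])

      chi, rho = 0.1, 0.9   # assumed cross-polar coupling power and spatial correlation

      # Dual-polarized 2x2: co-located orthogonal polarizations (low spatial correlation),
      # but the cross-polar entries carry only a fraction chi of the power.
      dp_gain = np.sqrt(np.array([[1.0, chi], [chi, 1.0]]))
      C_dp = ergodic(lambda: dp_gain * rayleigh((2, 2)))

      # Single-polarized 2x2: full-power entries, but closely spaced antennas
      # modeled with transmit/receive correlation rho (Kronecker model).
      L = np.linalg.cholesky(np.array([[1.0, rho], [rho, 1.0]], dtype=complex))
      C_sp = ergodic(lambda: L @ rayleigh((2, 2)) @ L.conj().T)

      print(f"dual-polarized   C = {C_dp:.2f} bit/s/Hz")
      print(f"single-polarized C = {C_sp:.2f} bit/s/Hz")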
In this paper, we propose a novel mult… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.20812v1-abstract-full').style.display = 'inline'; document.getElementById('2410.20812v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.20812v1-abstract-full" style="display: none;"> Co-examination of second-harmonic generation (SHG) and bright-field (BF) microscopy enables the differentiation of tissue components and collagen fibers, aiding the analysis of human breast and pancreatic cancer tissues. However, large discrepancies between SHG and BF images pose challenges for current learning-based registration models in aligning SHG to BF. In this paper, we propose a novel multi-modal registration framework that employs fidelity-imposed displacement editing to address these challenges. The framework integrates batch-wise contrastive learning, feature-based pre-alignment, and instance-level optimization. Experimental results from the Learn2Reg COMULISglobe SHG-BF Challenge validate the effectiveness of our method, securing the 1st place on the online leaderboard. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.20812v1-abstract-full').style.display = 'none'; document.getElementById('2410.20812v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.20374">arXiv:2410.20374</a> <span> [<a href="https://arxiv.org/pdf/2410.20374">pdf</a>, <a href="https://arxiv.org/format/2410.20374">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> A CT-guided Control Framework of a Robotic Flexible Endoscope for the Diagnosis of the Maxillary Sinusitis </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Zhu%2C+P">Puchen Zhu</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+H">Huayu Zhang</a>, <a href="/search/eess?searchtype=author&query=Ma%2C+X">Xin Ma</a>, <a href="/search/eess?searchtype=author&query=Zheng%2C+X">Xiaoyin Zheng</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+X">Xuchen Wang</a>, <a href="/search/eess?searchtype=author&query=Au%2C+K+W+S">Kwok Wai Samuel Au</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.20374v1-abstract-short" style="display: inline;"> Flexible endoscopes are commonly adopted in narrow and confined anatomical cavities due to their higher reachability and dexterity. However, prolonged and unintuitive manipulation of these endoscopes leads to an increased workload on surgeons and risks of collision. 
To address these challenges, this paper proposes a CT-guided control framework for the diagnosis of maxillary sinusitis by using a ro… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.20374v1-abstract-full').style.display = 'inline'; document.getElementById('2410.20374v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.20374v1-abstract-full" style="display: none;"> Flexible endoscopes are commonly adopted in narrow and confined anatomical cavities due to their higher reachability and dexterity. However, prolonged and unintuitive manipulation of these endoscopes leads to an increased workload on surgeons and risks of collision. To address these challenges, this paper proposes a CT-guided control framework for the diagnosis of maxillary sinusitis by using a robotic flexible endoscope. In the CT-guided control framework, a feasible path to the target position in the maxillary sinus cavity for the robotic flexible endoscope is designed. Besides, an optimal control scheme is proposed to autonomously control the robotic flexible endoscope to follow the feasible path. This greatly improves the efficiency and reduces the workload for surgeons. Several experiments were conducted based on a widely utilized sinus phantom, and the results showed that the robotic flexible endoscope can accurately and autonomously follow the feasible path and reach the target position in the maxillary sinus cavity. The results also verified the feasibility of the CT-guided control framework, which contributes an effective approach to early diagnosis of sinusitis in the future. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.20374v1-abstract-full').style.display = 'none'; document.getElementById('2410.20374v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.20326">arXiv:2410.20326</a> <span> [<a href="https://arxiv.org/pdf/2410.20326">pdf</a>, <a href="https://arxiv.org/format/2410.20326">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> SEEV: Synthesis with Efficient Exact Verification for ReLU Neural Barrier Functions </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Zhang%2C+H">Hongchao Zhang</a>, <a href="/search/eess?searchtype=author&query=Qin%2C+Z">Zhizhen Qin</a>, <a href="/search/eess?searchtype=author&query=Gao%2C+S">Sicun Gao</a>, <a href="/search/eess?searchtype=author&query=Clark%2C+A">Andrew Clark</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.20326v1-abstract-short" style="display: inline;"> Neural Control Barrier Functions (NCBFs) have shown significant promise in enforcing safety constraints on nonlinear autonomous systems. 
State-of-the-art exact approaches to verifying safety of NCBF-based controllers exploit the piecewise-linear structure of ReLU neural networks, however, such approaches still rely on enumerating all of the activation regions of the network near the safety boundar… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.20326v1-abstract-full').style.display = 'inline'; document.getElementById('2410.20326v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.20326v1-abstract-full" style="display: none;"> Neural Control Barrier Functions (NCBFs) have shown significant promise in enforcing safety constraints on nonlinear autonomous systems. State-of-the-art exact approaches to verifying safety of NCBF-based controllers exploit the piecewise-linear structure of ReLU neural networks, however, such approaches still rely on enumerating all of the activation regions of the network near the safety boundary, thus incurring high computation cost. In this paper, we propose a framework for Synthesis with Efficient Exact Verification (SEEV). Our framework consists of two components, namely (i) an NCBF synthesis algorithm that introduces a novel regularizer to reduce the number of activation regions at the safety boundary, and (ii) a verification algorithm that exploits tight over-approximations of the safety conditions to reduce the cost of verifying each piecewise-linear segment. Our simulations show that SEEV significantly improves verification efficiency while maintaining the CBF quality across various benchmark systems and neural network structures. Our code is available at https://github.com/HongchaoZhang-HZ/SEEV. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.20326v1-abstract-full').style.display = 'none'; document.getElementById('2410.20326v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.20288">arXiv:2410.20288</a> <span> [<a href="https://arxiv.org/pdf/2410.20288">pdf</a>, <a href="https://arxiv.org/format/2410.20288">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> Who is Responsible? 
Explaining Safety Violations in Multi-Agent Cyber-Physical Systems </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Niu%2C+L">Luyao Niu</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+H">Hongchao Zhang</a>, <a href="/search/eess?searchtype=author&query=Sahabandu%2C+D">Dinuka Sahabandu</a>, <a href="/search/eess?searchtype=author&query=Ramasubramanian%2C+B">Bhaskar Ramasubramanian</a>, <a href="/search/eess?searchtype=author&query=Clark%2C+A">Andrew Clark</a>, <a href="/search/eess?searchtype=author&query=Poovendran%2C+R">Radha Poovendran</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.20288v1-abstract-short" style="display: inline;"> Multi-agent cyber-physical systems are present in a variety of applications. Agent decision-making can be affected due to errors induced by uncertain, dynamic operating environments or due to incorrect actions taken by an agent. When an erroneous decision that leads to a violation of safety is identified, assigning responsibility to individual agents is a key step toward preventing future accident… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.20288v1-abstract-full').style.display = 'inline'; document.getElementById('2410.20288v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.20288v1-abstract-full" style="display: none;"> Multi-agent cyber-physical systems are present in a variety of applications. Agent decision-making can be affected due to errors induced by uncertain, dynamic operating environments or due to incorrect actions taken by an agent. When an erroneous decision that leads to a violation of safety is identified, assigning responsibility to individual agents is a key step toward preventing future accidents. Current approaches to carrying out such investigations require human labor or high degree of familiarity with operating environments. Automated strategies to assign responsibility can achieve a significant reduction in human effort and associated cognitive burden. In this paper, we develop an automated procedure to assign responsibility for safety violations to actions of any single agent in a principled manner. We base our approach on reasoning about safety violations in road safety. Given a safety violation, we use counterfactual reasoning to create alternative scenarios, showing how different outcomes could have occurred if certain actions had been replaced by others. We introduce the degree of responsibility (DoR) metric for each agent. The DoR, using the Shapley value, quantifies each agent's contribution to the safety violation, providing a basis to explain and justify decisions. We also develop heuristic techniques and methods based on agent interaction structures to improve scalability as agent numbers grow. We examine three safety violation cases from the National Highway Traffic Safety Administration (NHTSA). We run experiments using CARLA urban driving simulator. Results show the DoR improves the explainability of decisions and accountability for agent actions and their consequences. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.20288v1-abstract-full').style.display = 'none'; document.getElementById('2410.20288v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.19859">arXiv:2410.19859</a> <span> [<a href="https://arxiv.org/pdf/2410.19859">pdf</a>, <a href="https://arxiv.org/ps/2410.19859">ps</a>, <a href="https://arxiv.org/format/2410.19859">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Multi-Modal Transformer and Reinforcement Learning-based Beam Management </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Ghassemi%2C+M">Mohammad Ghassemi</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+H">Han Zhang</a>, <a href="/search/eess?searchtype=author&query=Afana%2C+A">Ali Afana</a>, <a href="/search/eess?searchtype=author&query=Sediq%2C+A+B">Akram Bin Sediq</a>, <a href="/search/eess?searchtype=author&query=Erol-Kantarci%2C+M">Melike Erol-Kantarci</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.19859v1-abstract-short" style="display: inline;"> Beam management is an important technique to improve signal strength and reduce interference in wireless communication systems. Recently, there has been increasing interest in using diverse sensing modalities for beam management. However, it remains a big challenge to process multi-modal data efficiently and extract useful information. On the other hand, the recently emerging multi-modal transform… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.19859v1-abstract-full').style.display = 'inline'; document.getElementById('2410.19859v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.19859v1-abstract-full" style="display: none;"> Beam management is an important technique to improve signal strength and reduce interference in wireless communication systems. Recently, there has been increasing interest in using diverse sensing modalities for beam management. However, it remains a big challenge to process multi-modal data efficiently and extract useful information. On the other hand, the recently emerging multi-modal transformer (MMT) is a promising technique that can process multi-modal data by capturing long-range dependencies. While MMT is highly effective in handling multi-modal data and providing robust beam management, integrating reinforcement learning (RL) further enhances their adaptability in dynamic environments. In this work, we propose a two-step beam management method by combining MMT with RL for dynamic beam index prediction. 
In the first step, we divide available beam indices into several groups and leverage MMT to process diverse data modalities to predict the optimal beam group. In the second step, we employ RL for fast beam decision-making within each group, which in return maximizes throughput. Our proposed framework is tested on a 6G dataset. In this testing scenario, it achieves higher beam prediction accuracy and system throughput compared to both the MMT-only based method and the RL-only based method. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.19859v1-abstract-full').style.display = 'none'; document.getElementById('2410.19859v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">5 pages, 5 figures, IEEE Networking Letters</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.19811">arXiv:2410.19811</a> <span> [<a href="https://arxiv.org/pdf/2410.19811">pdf</a>, <a href="https://arxiv.org/format/2410.19811">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Optimization and Control">math.OC</span> </div> </div> <p class="title is-5 mathjax"> ControlAgent: Automating Control System Design via Novel Integration of LLM Agents and Domain Expertise </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Guo%2C+X">Xingang Guo</a>, <a href="/search/eess?searchtype=author&query=Keivan%2C+D">Darioush Keivan</a>, <a href="/search/eess?searchtype=author&query=Syed%2C+U">Usman Syed</a>, <a href="/search/eess?searchtype=author&query=Qin%2C+L">Lianhui Qin</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+H">Huan Zhang</a>, <a href="/search/eess?searchtype=author&query=Dullerud%2C+G">Geir Dullerud</a>, <a href="/search/eess?searchtype=author&query=Seiler%2C+P">Peter Seiler</a>, <a href="/search/eess?searchtype=author&query=Hu%2C+B">Bin Hu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.19811v1-abstract-short" style="display: inline;"> Control system design is a crucial aspect of modern engineering with far-reaching applications across diverse sectors including aerospace, automotive systems, power grids, and robotics. Despite advances made by Large Language Models (LLMs) in various domains, their application in control system design remains limited due to the complexity and specificity of control theory. 
To bridge this gap, we i… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.19811v1-abstract-full').style.display = 'inline'; document.getElementById('2410.19811v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.19811v1-abstract-full" style="display: none;"> Control system design is a crucial aspect of modern engineering with far-reaching applications across diverse sectors including aerospace, automotive systems, power grids, and robotics. Despite advances made by Large Language Models (LLMs) in various domains, their application in control system design remains limited due to the complexity and specificity of control theory. To bridge this gap, we introduce ControlAgent, a new paradigm that automates control system design via novel integration of LLM agents and control-oriented domain expertise. ControlAgent encodes expert control knowledge and emulates human iterative design processes by gradually tuning controller parameters to meet user-specified requirements for stability, performance, and robustness. ControlAgent integrates multiple collaborative LLM agents, including a central agent responsible for task distribution and task-specific agents dedicated to detailed controller design for various types of systems and requirements. ControlAgent also employs a Python computation agent that performs complex calculations and controller evaluations based on standard design information provided by task-specified LLM agents. Combined with a history and feedback module, the task-specific LLM agents iteratively refine controller parameters based on real-time feedback from prior designs. Overall, ControlAgent mimics the design processes used by (human) practicing engineers, but removes all the human efforts and can be run in a fully automated way to give end-to-end solutions for control system design with user-specified requirements. To validate ControlAgent's effectiveness, we develop ControlEval, an evaluation dataset that comprises 500 control tasks with various specific design goals. The effectiveness of ControlAgent is demonstrated via extensive comparative evaluations between LLM-based and traditional human-involved toolbox-based baselines. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.19811v1-abstract-full').style.display = 'none'; document.getElementById('2410.19811v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.17209">arXiv:2410.17209</a> <span> [<a href="https://arxiv.org/pdf/2410.17209">pdf</a>, <a href="https://arxiv.org/format/2410.17209">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Audio-to-Score Conversion Model Based on Whisper methodology </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Zhang%2C+H">Hongyao Zhang</a>, <a href="/search/eess?searchtype=author&query=Sun%2C+B">Bohang Sun</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.17209v1-abstract-short" style="display: inline;"> This thesis develops a Transformer model based on Whisper, which extracts melodies and chords from music audio and records them into ABC notation. A comprehensive data processing workflow is customized for ABC notation, including data cleansing, formatting, and conversion, and a mutation mechanism is implemented to increase the diversity and quality of training data. This thesis innovatively intro… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.17209v1-abstract-full').style.display = 'inline'; document.getElementById('2410.17209v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.17209v1-abstract-full" style="display: none;"> This thesis develops a Transformer model based on Whisper, which extracts melodies and chords from music audio and records them into ABC notation. A comprehensive data processing workflow is customized for ABC notation, including data cleansing, formatting, and conversion, and a mutation mechanism is implemented to increase the diversity and quality of training data. This thesis innovatively introduces the "Orpheus' Score", a custom notation system that converts music information into tokens, designs a custom vocabulary library, and trains a corresponding custom tokenizer. Experiments show that compared to traditional algorithms, the model has significantly improved accuracy and performance. While providing a convenient audio-to-score tool for music enthusiasts, this work also provides new ideas and tools for research in music information processing. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.17209v1-abstract-full').style.display = 'none'; document.getElementById('2410.17209v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">5 pages, 7 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.15614">arXiv:2410.15614</a> <span> [<a href="https://arxiv.org/pdf/2410.15614">pdf</a>, <a href="https://arxiv.org/format/2410.15614">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Neurons and Cognition">q-bio.NC</span> </div> </div> <p class="title is-5 mathjax"> Topology-Aware Exploration of Circle of Willis for CTA and MRA: Segmentation, Detection, and Classification </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Zhang%2C+M">Minghui Zhang</a>, <a href="/search/eess?searchtype=author&query=You%2C+X">Xin You</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+H">Hanxiao Zhang</a>, <a href="/search/eess?searchtype=author&query=Gu%2C+Y">Yun Gu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.15614v1-abstract-short" style="display: inline;"> The Circle of Willis (CoW) vessels is critical to connecting major circulations of the brain. The topology of the vascular structure is clinical significance to evaluate the risk, severity of the neuro-vascular diseases. The CoW has two representative angiographic imaging modalities, computed tomography angiography (CTA) and magnetic resonance angiography (MRA). TopCow24 provided 125 paired CTA-MR… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.15614v1-abstract-full').style.display = 'inline'; document.getElementById('2410.15614v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.15614v1-abstract-full" style="display: none;"> The Circle of Willis (CoW) vessels is critical to connecting major circulations of the brain. The topology of the vascular structure is clinical significance to evaluate the risk, severity of the neuro-vascular diseases. The CoW has two representative angiographic imaging modalities, computed tomography angiography (CTA) and magnetic resonance angiography (MRA). TopCow24 provided 125 paired CTA-MRA dataset for the analysis of CoW. To explore both CTA and MRA images in a unified framework to learn the inherent topology of Cow, we construct the universal dataset via independent intensity preprocess, followed by joint resampling and normarlization. Then, we utilize the topology-aware loss to enhance the topology completeness of the CoW and the discrimination between different classes. A complementary topology-aware refinement is further conducted to enhance the connectivity within the same class. Our method was evaluated on all the three tasks and two modalities, achieving competitive results. 
In the final test phase of TopCow24 Challenge, we achieved the second place in the CTA-Seg-Task, the third palce in the CTA-Box-Task, the first place in the CTA-Edg-Task, the second place in the MRA-Seg-Task, the third palce in the MRA-Box-Task, the second place in the MRA-Edg-Task. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.15614v1-abstract-full').style.display = 'none'; document.getElementById('2410.15614v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Participation technical report for TopCoW24 challenge @ MICCAI 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.13312">arXiv:2410.13312</a> <span> [<a href="https://arxiv.org/pdf/2410.13312">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Windowed Compressed Spectrum Sensing with Block sparsity </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Zhang%2C+H">Huiguang Zhang</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+B">Baoguo Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.13312v1-abstract-short" style="display: inline;"> Compressed Spectrum Sensing (CSS) is widely employed in spectral analysis due to its sampling efficiency. However, conventional CSS assumes a standard sparse spectrum, which is affected by Spectral Leakage (SL). Despite the widespread use of CSS, the impact of SL on its performance has not been systematically and thoroughly investigated. This study addresses this research gap by analyzing the Rest… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.13312v1-abstract-full').style.display = 'inline'; document.getElementById('2410.13312v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.13312v1-abstract-full" style="display: none;"> Compressed Spectrum Sensing (CSS) is widely employed in spectral analysis due to its sampling efficiency. However, conventional CSS assumes a standard sparse spectrum, which is affected by Spectral Leakage (SL). Despite the widespread use of CSS, the impact of SL on its performance has not been systematically and thoroughly investigated. This study addresses this research gap by analyzing the Restricted Isometry Property (RIP) of windowed Gaussian measurement matrices and proposing a novel block-sparse CSS model. We introduce the Edge Zeroing Coefficient (EZC) to evaluate SL suppression and RIP impact, and the Window Scaling Coefficient (WSC) to quantify the effect on RIP. 
Our research investigates the influence of Window Function (WF) on signal sparsity and measurement matrices, and presents a block-sparse CSS model that considers component frequency distribution, signal length, windowing, and noise floor. Based on subspace counting theory, we derive sample bound for our model. The findings demonstrate that while WFs reduce SL, excessively small EZC and WSC values can negatively affect RIP quality and cause numerical instability during signal reconstruction. This highlights the delicate balance required when applying WFs in CSS. Our block-sparse approach enables precise compression and reconstruction, particularly for high noise floor and super-sparse signals. This study provides a framework for optimizing CSS performance when dealing with SL and sparse signals, offering insights for improving signal reconstruction quality in various applications <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.13312v1-abstract-full').style.display = 'none'; document.getElementById('2410.13312v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">36 pages, 10 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.12320">arXiv:2410.12320</a> <span> [<a href="https://arxiv.org/pdf/2410.12320">pdf</a>, <a href="https://arxiv.org/format/2410.12320">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> A Hierarchical DRL Approach for Resource Optimization in Multi-RIS Multi-Operator Networks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Zhang%2C+H">Haocheng Zhang</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+W">Wei Wang</a>, <a href="/search/eess?searchtype=author&query=Zhou%2C+H">Hao Zhou</a>, <a href="/search/eess?searchtype=author&query=Lu%2C+Z">Zhiping Lu</a>, <a href="/search/eess?searchtype=author&query=Li%2C+M">Ming Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.12320v1-abstract-short" style="display: inline;"> As reconfigurable intelligent surfaces (RIS) emerge as a pivotal technology in the upcoming sixth-generation (6G) networks, their deployment within practical multiple operator (OP) networks presents significant challenges, including the coordination of RIS configurations among OPs, interference management, and privacy maintenance. 
A promising strategy is to treat RIS as a public resource managed b… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.12320v1-abstract-full').style.display = 'inline'; document.getElementById('2410.12320v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.12320v1-abstract-full" style="display: none;"> As reconfigurable intelligent surfaces (RIS) emerge as a pivotal technology in the upcoming sixth-generation (6G) networks, their deployment within practical multiple operator (OP) networks presents significant challenges, including the coordination of RIS configurations among OPs, interference management, and privacy maintenance. A promising strategy is to treat RIS as a public resource managed by an RIS provider (RP), which can enhance resource allocation efficiency by allowing dynamic access for multiple OPs. However, the intricate nature of coordinating management and optimizing RIS configurations significantly complicates the implementation process. In this paper, we propose a hierarchical deep reinforcement learning (HDRL) approach that decomposes the complicated RIS resource optimization problem into several subtasks. Specifically, a top-level RP-agent is responsible for RIS allocation, while low-level OP-agents control their assigned RISs and handle beamforming, RIS phase-shifts, and user association. By utilizing the semi-Markov decision process (SMDP) theory, we establish a sophisticated interaction mechanism between the RP and OPs, and introduce an advanced hierarchical proximal policy optimization (HPPO) algorithm. Furthermore, we propose an improved sequential-HPPO (S-HPPO) algorithm to address the curse of dimensionality encountered with a single RP-agent. Experimental results validate the stability of the HPPO algorithm across various environmental parameters, demonstrating its superiority over other benchmarks for joint resource optimization. Finally, we conduct a detailed comparative analysis between the proposed S-HPPO and HPPO algorithms, showcasing that the S-HPPO algorithm achieves faster convergence and improved performance in large-scale RIS allocation scenarios. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.12320v1-abstract-full').style.display = 'none'; document.getElementById('2410.12320v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.11181">arXiv:2410.11181</a> <span> [<a href="https://arxiv.org/pdf/2410.11181">pdf</a>, <a href="https://arxiv.org/format/2410.11181">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> DARNet: Dual Attention Refinement Network with Spatiotemporal Construction for Auditory Attention Detection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Yan%2C+S">Sheng Yan</a>, <a href="/search/eess?searchtype=author&query=fan%2C+C">Cunhang fan</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+H">Hongyu Zhang</a>, <a href="/search/eess?searchtype=author&query=Yang%2C+X">Xiaoke Yang</a>, <a href="/search/eess?searchtype=author&query=Tao%2C+J">Jianhua Tao</a>, <a href="/search/eess?searchtype=author&query=Lv%2C+Z">Zhao Lv</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.11181v2-abstract-short" style="display: inline;"> At a cocktail party, humans exhibit an impressive ability to direct their attention. The auditory attention detection (AAD) approach seeks to identify the attended speaker by analyzing brain signals, such as EEG signals. However, current AAD algorithms overlook the spatial distribution information within EEG signals and lack the ability to capture long-range latent dependencies, limiting the model… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.11181v2-abstract-full').style.display = 'inline'; document.getElementById('2410.11181v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.11181v2-abstract-full" style="display: none;"> At a cocktail party, humans exhibit an impressive ability to direct their attention. The auditory attention detection (AAD) approach seeks to identify the attended speaker by analyzing brain signals, such as EEG signals. However, current AAD algorithms overlook the spatial distribution information within EEG signals and lack the ability to capture long-range latent dependencies, limiting the model's ability to decode brain activity. To address these issues, this paper proposes a dual attention refinement network with spatiotemporal construction for AAD, named DARNet, which consists of the spatiotemporal construction module, dual attention refinement module, and feature fusion \& classifier module. Specifically, the spatiotemporal construction module aims to construct more expressive spatiotemporal feature representations, by capturing the spatial distribution characteristics of EEG signals. The dual attention refinement module aims to extract different levels of temporal patterns in EEG signals and enhance the model's ability to capture long-range latent dependencies. 
The feature fusion \& classifier module aims to aggregate temporal patterns and dependencies from different levels and obtain the final classification results. The experimental results indicate that compared to the state-of-the-art models, DARNet achieves an average classification accuracy improvement of 5.9\% for 0.1s, 4.6\% for 1s, and 3.9\% for 2s on the DTU dataset. While maintaining excellent classification performance, DARNet significantly reduces the number of required parameters. Compared to the state-of-the-art models, DARNet reduces the parameter count by 91\%. Code is available at: https://github.com/fchest/DARNet.git. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.11181v2-abstract-full').style.display = 'none'; document.getElementById('2410.11181v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 14 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.10676">arXiv:2410.10676</a> <span> [<a href="https://arxiv.org/pdf/2410.10676">pdf</a>, <a href="https://arxiv.org/format/2410.10676">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Both Ears Wide Open: Towards Language-Driven Spatial Audio Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Sun%2C+P">Peiwen Sun</a>, <a href="/search/eess?searchtype=author&query=Cheng%2C+S">Sitong Cheng</a>, <a href="/search/eess?searchtype=author&query=Li%2C+X">Xiangtai Li</a>, <a href="/search/eess?searchtype=author&query=Ye%2C+Z">Zhen Ye</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+H">Huadai Liu</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+H">Honggang Zhang</a>, <a href="/search/eess?searchtype=author&query=Xue%2C+W">Wei Xue</a>, <a href="/search/eess?searchtype=author&query=Guo%2C+Y">Yike Guo</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.10676v1-abstract-short" style="display: inline;"> Recently, diffusion models have achieved great success in mono-channel audio generation. However, when it comes to stereo audio generation, the soundscapes often have a complex scene of multiple objects and directions. Controlling stereo audio with spatial contexts remains challenging due to high data costs and unstable generative models. 
To the best of our knowledge, this work represents the firs… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.10676v1-abstract-full').style.display = 'inline'; document.getElementById('2410.10676v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.10676v1-abstract-full" style="display: none;"> Recently, diffusion models have achieved great success in mono-channel audio generation. However, when it comes to stereo audio generation, the soundscapes often have a complex scene of multiple objects and directions. Controlling stereo audio with spatial contexts remains challenging due to high data costs and unstable generative models. To the best of our knowledge, this work represents the first attempt to address these issues. We first construct a large-scale, simulation-based, and GPT-assisted dataset, BEWO-1M, with abundant soundscapes and descriptions even including moving and multiple sources. Beyond text modality, we have also acquired a set of images and rationally paired stereo audios through retrieval to advance multimodal generation. Existing audio generation models tend to generate rather random and indistinct spatial audio. To provide accurate guidance for latent diffusion models, we introduce the SpatialSonic model utilizing spatial-aware encoders and azimuth state matrices to reveal reasonable spatial guidance. By leveraging spatial guidance, our unified model not only achieves the objective of generating immersive and controllable spatial audio from text and image but also enables interactive audio generation during inference. Finally, under fair settings, we conduct subjective and objective evaluations on simulated and real-world data to compare our approach with prevailing methods. The results demonstrate the effectiveness of our method, highlighting its capability to generate spatial audio that adheres to physical rules. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.10676v1-abstract-full').style.display = 'none'; document.getElementById('2410.10676v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.10265">arXiv:2410.10265</a> <span> [<a href="https://arxiv.org/pdf/2410.10265">pdf</a>, <a href="https://arxiv.org/format/2410.10265">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> FSOS-AMC: Few-Shot Open-Set Learning for Automatic Modulation Classification </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Zhang%2C+H">Hao Zhang</a>, <a href="/search/eess?searchtype=author&query=Zhou%2C+F">Fuhui Zhou</a>, <a href="/search/eess?searchtype=author&query=Wu%2C+Q">Qihui Wu</a>, <a href="/search/eess?searchtype=author&query=Yuen%2C+C">Chau Yuen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.10265v1-abstract-short" style="display: inline;"> Automatic modulation classification (AMC) is essential for the advancement and efficiency of future wireless communication networks. Deep learning (DL)-based AMC frameworks have garnered extensive attention for their impressive classification performance. However, existing DL-based AMC frameworks rely on two assumptions, large-scale training data and the same class pool between the training and te… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.10265v1-abstract-full').style.display = 'inline'; document.getElementById('2410.10265v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.10265v1-abstract-full" style="display: none;"> Automatic modulation classification (AMC) is essential for the advancement and efficiency of future wireless communication networks. Deep learning (DL)-based AMC frameworks have garnered extensive attention for their impressive classification performance. However, existing DL-based AMC frameworks rely on two assumptions, large-scale training data and the same class pool between the training and testing data, which are not suitable for \emph{few-shot and open-set} scenarios. To address this issue, a novel few-shot open-set automatic modulation classification (FSOS-AMC) framework is proposed by exploiting a multi-scale attention network, meta-prototype training, and a modular open-set classifier. The multi-scale attention network is used to extract the features from the input signal, the meta-prototype training is adopted to train the feature extractor and the modular open-set classifier can be utilized to classify the testing data into one of the known modulations or potential unknown modulations. Extensive simulation results demonstrate that the proposed FSOS-AMC framework can achieve higher classification accuracy than the state-of-the-art methods for known modulations and unknown modulations in terms of accuracy and area under the receiver operating characteristic curve (AUROC). Moreover, the performance of the proposed FSOS-AMC framework under low signal-to-noise ratio (SNR) conditions is much better than the compared schemes. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.10265v1-abstract-full').style.display = 'none'; document.getElementById('2410.10265v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">accepted by 16th International Conference on Wireless Communications and Signal Processing (WCSP 2024)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.05013">arXiv:2410.05013</a> <span> [<a href="https://arxiv.org/pdf/2410.05013">pdf</a>, <a href="https://arxiv.org/format/2410.05013">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Degrees of Freedom of Holographic MIMO in Multi-user Near-field Channels </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Chen%2C+H">Houfeng Chen</a>, <a href="/search/eess?searchtype=author&query=Yue%2C+S">Shaohua Yue</a>, <a href="/search/eess?searchtype=author&query=Di+Renzo%2C+M">Marco Di Renzo</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+H">Hongliang Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.05013v1-abstract-short" style="display: inline;"> Holographic multiple-input multiple-output (HMIMO) is an emerging technology for 6G communications, in which numerous antenna units are integrated in a limited space. As the HMIMO array aperture expands, the near-field region of the array is dramatically enlarged, resulting in more users being located in the near-field region. This creates new opportunities for wireless communications. In this con… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.05013v1-abstract-full').style.display = 'inline'; document.getElementById('2410.05013v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.05013v1-abstract-full" style="display: none;"> Holographic multiple-input multiple-output (HMIMO) is an emerging technology for 6G communications, in which numerous antenna units are integrated in a limited space. As the HMIMO array aperture expands, the near-field region of the array is dramatically enlarged, resulting in more users being located in the near-field region. This creates new opportunities for wireless communications. In this context, the evaluation of the spatial degrees of freedom (DoF) of HMIMO multi-user systems in near-field channels is an open problem, as the methods of analysis utilized for evaluating the DoF in far-field channels cannnot be directly applied due to the different propagation characteristics. In this paper, we propose a novel method to calculate the DoF of HMIMO in multi-user near-field channels. We first derive the DoF for a single user in the near field, and then extend the analysis to multi-user scenarios. 
In this latter scenario, we focus on the impact of spatial blocking between HMIMO users. The derived analytical framework reveals that the DoF of HMIMO in multi-user near-field channels is not in general given by the sum of the DoF of the HMIMO single-user setting. Simulation results demonstrate that the proposed method can accurately estimate the DoF in HMIMO multi-user near-field channels in the presence of spatial blocking. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.05013v1-abstract-full').style.display = 'none'; document.getElementById('2410.05013v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">5 pages, 5 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.04081">arXiv:2410.04081</a> <span> [<a href="https://arxiv.org/pdf/2410.04081">pdf</a>, <a href="https://arxiv.org/format/2410.04081">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> $ε$-VAE: Denoising as Visual Decoding </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Zhao%2C+L">Long Zhao</a>, <a href="/search/eess?searchtype=author&query=Woo%2C+S">Sanghyun Woo</a>, <a href="/search/eess?searchtype=author&query=Wan%2C+Z">Ziyu Wan</a>, <a href="/search/eess?searchtype=author&query=Li%2C+Y">Yandong Li</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+H">Han Zhang</a>, <a href="/search/eess?searchtype=author&query=Gong%2C+B">Boqing Gong</a>, <a href="/search/eess?searchtype=author&query=Adam%2C+H">Hartwig Adam</a>, <a href="/search/eess?searchtype=author&query=Jia%2C+X">Xuhui Jia</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+T">Ting Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.04081v1-abstract-short" style="display: inline;"> In generative modeling, tokenization simplifies complex data into compact, structured representations, creating a more efficient, learnable space. For high-dimensional visual data, it reduces redundancy and emphasizes key features for high-quality generation. 
Current visual tokenization methods rely on a traditional autoencoder framework, where the encoder compresses data into latent representatio… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.04081v1-abstract-full').style.display = 'inline'; document.getElementById('2410.04081v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.04081v1-abstract-full" style="display: none;"> In generative modeling, tokenization simplifies complex data into compact, structured representations, creating a more efficient, learnable space. For high-dimensional visual data, it reduces redundancy and emphasizes key features for high-quality generation. Current visual tokenization methods rely on a traditional autoencoder framework, where the encoder compresses data into latent representations, and the decoder reconstructs the original input. In this work, we offer a new perspective by proposing denoising as decoding, shifting from single-step reconstruction to iterative refinement. Specifically, we replace the decoder with a diffusion process that iteratively refines noise to recover the original image, guided by the latents provided by the encoder. We evaluate our approach by assessing both reconstruction (rFID) and generation quality (FID), comparing it to a state-of-the-art autoencoding approach. We hope this work offers new insights into integrating iterative generation and autoencoding for improved compression and generation. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.04081v1-abstract-full').style.display = 'none'; document.getElementById('2410.04081v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.03139">arXiv:2410.03139</a> <span> [<a href="https://arxiv.org/pdf/2410.03139">pdf</a>, <a href="https://arxiv.org/format/2410.03139">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> How does the teacher rate? Observations from the NeuroPiano dataset </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Zhang%2C+H">Huan Zhang</a>, <a href="/search/eess?searchtype=author&query=Cheung%2C+V">Vincent Cheung</a>, <a href="/search/eess?searchtype=author&query=Nishioka%2C+H">Hayato Nishioka</a>, <a href="/search/eess?searchtype=author&query=Dixon%2C+S">Simon Dixon</a>, <a href="/search/eess?searchtype=author&query=Furuya%2C+S">Shinichi Furuya</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.03139v1-abstract-short" style="display: inline;"> This paper provides a detailed analysis of the NeuroPiano dataset, which comprises 104 audio recordings of student piano performances accompanied by 2255 textual feedback entries and ratings given by professional pianists. 
We offer a statistical overview of the dataset, focusing on the standardization of annotations and inter-annotator agreement across 12 evaluative questions concerning performance quali… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.03139v1-abstract-full').style.display = 'inline'; document.getElementById('2410.03139v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.03139v1-abstract-full" style="display: none;"> This paper provides a detailed analysis of the NeuroPiano dataset, which comprises 104 audio recordings of student piano performances accompanied by 2255 textual feedback entries and ratings given by professional pianists. We offer a statistical overview of the dataset, focusing on the standardization of annotations and inter-annotator agreement across 12 evaluative questions concerning performance quality. We also explore the predictive relationship between audio features and teacher ratings via machine learning, as well as annotations provided for text analysis of the responses. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.03139v1-abstract-full').style.display = 'none'; document.getElementById('2410.03139v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.02169">arXiv:2410.02169</a> <span> [<a href="https://arxiv.org/pdf/2410.02169">pdf</a>, <a href="https://arxiv.org/ps/2410.02169">ps</a>, <a href="https://arxiv.org/format/2410.02169">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> Simulation Results of Center-Manifold-Based Identification of Polynomial Nonlinear Systems with Uncontrollable Linearization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Huang%2C+C">Chao Huang</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+H">Hao Zhang</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+Z">Zhuping Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.02169v1-abstract-short" style="display: inline;"> Recently, a system identification method based on the center manifold was proposed to identify polynomial nonlinear systems with uncontrollable linearization. This note presents a numerical example to show the effectiveness of this method. </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.02169v1-abstract-full" style="display: none;"> Recently, a system identification method based on the center manifold was proposed to identify polynomial nonlinear systems with uncontrollable linearization. This note presents a numerical example to show the effectiveness of this method. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.02169v1-abstract-full').style.display = 'none'; document.getElementById('2410.02169v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.01150">arXiv:2410.01150</a> <span> [<a href="https://arxiv.org/pdf/2410.01150">pdf</a>, <a href="https://arxiv.org/format/2410.01150">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> Restorative Speech Enhancement: A Progressive Approach Using SE and Codec Modules </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Chiang%2C+H">Hsin-Tien Chiang</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+H">Hao Zhang</a>, <a href="/search/eess?searchtype=author&query=Xu%2C+Y">Yong Xu</a>, <a href="/search/eess?searchtype=author&query=Yu%2C+M">Meng Yu</a>, <a href="/search/eess?searchtype=author&query=Yu%2C+D">Dong Yu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.01150v1-abstract-short" style="display: inline;"> In challenging environments with significant noise and reverberation, traditional speech enhancement (SE) methods often lead to over-suppressed speech, creating artifacts during listening and harming downstream tasks performance. To overcome these limitations, we propose a novel approach called Restorative SE (RestSE), which combines a lightweight SE module with a generative codec module to progre… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.01150v1-abstract-full').style.display = 'inline'; document.getElementById('2410.01150v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.01150v1-abstract-full" style="display: none;"> In challenging environments with significant noise and reverberation, traditional speech enhancement (SE) methods often lead to over-suppressed speech, creating artifacts during listening and harming downstream tasks performance. To overcome these limitations, we propose a novel approach called Restorative SE (RestSE), which combines a lightweight SE module with a generative codec module to progressively enhance and restore speech quality. The SE module initially reduces noise, while the codec module subsequently performs dereverberation and restores speech using generative capabilities. We systematically explore various quantization techniques within the codec module to optimize performance. Additionally, we introduce a weighted loss function and feature fusion that merges the SE output with the original mixture, particularly at segments where the SE output is heavily distorted. Experimental results demonstrate the effectiveness of our proposed method in enhancing speech quality under adverse conditions. 
Audio demos are available at: https://sophie091524.github.io/RestorativeSE/. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.01150v1-abstract-full').style.display = 'none'; document.getElementById('2410.01150v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Paper in submission</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.17996">arXiv:2409.17996</a> <span> [<a href="https://arxiv.org/pdf/2409.17996">pdf</a>, <a href="https://arxiv.org/format/2409.17996">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> PhoCoLens: Photorealistic and Consistent Reconstruction in Lensless Imaging </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Cai%2C+X">Xin Cai</a>, <a href="/search/eess?searchtype=author&query=You%2C+Z">Zhiyuan You</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+H">Hailong Zhang</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+W">Wentao Liu</a>, <a href="/search/eess?searchtype=author&query=Gu%2C+J">Jinwei Gu</a>, <a href="/search/eess?searchtype=author&query=Xue%2C+T">Tianfan Xue</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.17996v2-abstract-short" style="display: inline;"> Lensless cameras offer significant advantages in size, weight, and cost compared to traditional lens-based systems. Without a focusing lens, lensless cameras rely on computational algorithms to recover the scenes from multiplexed measurements. However, current algorithms struggle with inaccurate forward imaging models and insufficient priors to reconstruct high-quality images. To overcome these li… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.17996v2-abstract-full').style.display = 'inline'; document.getElementById('2409.17996v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.17996v2-abstract-full" style="display: none;"> Lensless cameras offer significant advantages in size, weight, and cost compared to traditional lens-based systems. Without a focusing lens, lensless cameras rely on computational algorithms to recover the scenes from multiplexed measurements. However, current algorithms struggle with inaccurate forward imaging models and insufficient priors to reconstruct high-quality images. To overcome these limitations, we introduce a novel two-stage approach for consistent and photorealistic lensless image reconstruction. 
The first stage of our approach ensures data consistency by focusing on accurately reconstructing the low-frequency content with a spatially varying deconvolution method that adjusts to changes in the Point Spread Function (PSF) across the camera's field of view. The second stage enhances photorealism by incorporating a generative prior from pre-trained diffusion models. By conditioning on the low-frequency content retrieved in the first stage, the diffusion model effectively reconstructs the high-frequency details that are typically lost in the lensless imaging process, while also maintaining image fidelity. Our method achieves a superior balance between data fidelity and visual quality compared to existing methods, as demonstrated with two popular lensless systems, PhlatCam and DiffuserCam. Project website: https://phocolens.github.io/. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.17996v2-abstract-full').style.display = 'none'; document.getElementById('2409.17996v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 26 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">NeurIPS 2024 Spotlight</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.16637">arXiv:2409.16637</a> <span> [<a href="https://arxiv.org/pdf/2409.16637">pdf</a>, <a href="https://arxiv.org/ps/2409.16637">ps</a>, <a href="https://arxiv.org/format/2409.16637">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Deep-Learning Recognition of Scanning Transmission Electron Microscopy: Quantifying and Mitigating the Influence of Gaussian Noises </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Zhang%2C+H">Hanlei Zhang</a>, <a href="/search/eess?searchtype=author&query=Bai%2C+J">Jincheng Bai</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+X">Xiabo Chen</a>, <a href="/search/eess?searchtype=author&query=Li%2C+C">Can Li</a>, <a href="/search/eess?searchtype=author&query=Zhong%2C+C">Chuanjian Zhong</a>, <a href="/search/eess?searchtype=author&query=Fang%2C+J">Jiye Fang</a>, <a href="/search/eess?searchtype=author&query=Zhou%2C+G">Guangwen Zhou</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.16637v1-abstract-short" style="display: inline;"> Scanning transmission electron microscopy (STEM) is a powerful tool to reveal the morphologies and structures of materials, thereby attracting intensive interests from the scientific and industrial communities. 
The outstanding spatial (atomic level) and temporal (ms level) resolutions of the STEM techniques generate fruitful amounts of high-definition data, thereby enabling the high-volume and hig… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.16637v1-abstract-full').style.display = 'inline'; document.getElementById('2409.16637v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.16637v1-abstract-full" style="display: none;"> Scanning transmission electron microscopy (STEM) is a powerful tool to reveal the morphologies and structures of materials, thereby attracting intensive interests from the scientific and industrial communities. The outstanding spatial (atomic level) and temporal (ms level) resolutions of the STEM techniques generate fruitful amounts of high-definition data, thereby enabling the high-volume and high-speed analysis of materials. On the other hand, processing of the big dataset generated by STEM is time-consuming and beyond the capability of human-based manual work, which urgently calls for computer-based automation. In this work, we present a deep-learning mask region-based neural network (Mask R-CNN) for the recognition of nanoparticles imaged by STEM, as well as generating the associated dimensional analysis. The Mask R-CNN model was tested on simulated STEM-HAADF results with different Gaussian noises, particle shapes and particle sizes, and the results indicated that Gaussian noise has a determining influence on the accuracy of recognition. By applying Gaussian and Non-Local Means filters on the noise-containing STEM-HAADF results, the influences of noises are largely mitigated, and recognition accuracy is significantly improved. This filtering-recognition approach was further applied to experimental STEM-HAADF results, which yields satisfying accuracy compared with the traditional threshold methods. The deep-learning-based method developed in this work has great potential in the analysis of the complicated structures and large data generated by STEM-HAADF. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.16637v1-abstract-full').style.display = 'none'; document.getElementById('2409.16637v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.16431">arXiv:2409.16431</a> <span> [<a href="https://arxiv.org/pdf/2409.16431">pdf</a>, <a href="https://arxiv.org/format/2409.16431">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> Hand Gesture Classification Based on Forearm Ultrasound Video Snippets Using 3D Convolutional Neural Networks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Bimbraw%2C+K">Keshav Bimbraw</a>, <a href="/search/eess?searchtype=author&query=Talele%2C+A">Ankit Talele</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+H+K">Haichong K. Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.16431v1-abstract-short" style="display: inline;"> Ultrasound based hand movement estimation is a crucial area of research with applications in human-machine interaction. Forearm ultrasound offers detailed information about muscle morphology changes during hand movement which can be used to estimate hand gestures. Previous work has focused on analyzing 2-Dimensional (2D) ultrasound image frames using techniques such as convolutional neural network… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.16431v1-abstract-full').style.display = 'inline'; document.getElementById('2409.16431v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.16431v1-abstract-full" style="display: none;"> Ultrasound based hand movement estimation is a crucial area of research with applications in human-machine interaction. Forearm ultrasound offers detailed information about muscle morphology changes during hand movement which can be used to estimate hand gestures. Previous work has focused on analyzing 2-Dimensional (2D) ultrasound image frames using techniques such as convolutional neural networks (CNNs). However, such 2D techniques do not capture temporal features from segments of ultrasound data corresponding to continuous hand movements. This study uses 3D CNN based techniques to capture spatio-temporal patterns within ultrasound video segments for gesture recognition. We compared the performance of a 2D convolution-based network with (2+1)D convolution-based, 3D convolution-based, and our proposed network. Our methodology enhanced the gesture classification accuracy to 98.8 +/- 0.9%, from 96.5 +/- 2.3% compared to a network trained with 2D convolution layers. These results demonstrate the advantages of using ultrasound video snippets for improving hand gesture classification performance. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.16431v1-abstract-full').style.display = 'none'; document.getElementById('2409.16431v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to IUS 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.16312">arXiv:2409.16312</a> <span> [<a href="https://arxiv.org/pdf/2409.16312">pdf</a>, <a href="https://arxiv.org/format/2409.16312">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Quantitative Methods">q-bio.QM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> SEE: Semantically Aligned EEG-to-Text Translation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Tao%2C+Y">Yitian Tao</a>, <a href="/search/eess?searchtype=author&query=Liang%2C+Y">Yan Liang</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+L">Luoyu Wang</a>, <a href="/search/eess?searchtype=author&query=Li%2C+Y">Yongqing Li</a>, <a href="/search/eess?searchtype=author&query=Yang%2C+Q">Qing Yang</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+H">Han Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.16312v1-abstract-short" style="display: inline;"> Decoding neurophysiological signals into language is of great research interest within brain-computer interface (BCI) applications. Electroencephalography (EEG), known for its non-invasiveness, ease of use, and cost-effectiveness, has been a popular method in this field. However, current EEG-to-Text decoding approaches face challenges due to the huge domain gap between EEG recordings and raw texts… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.16312v1-abstract-full').style.display = 'inline'; document.getElementById('2409.16312v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.16312v1-abstract-full" style="display: none;"> Decoding neurophysiological signals into language is of great research interest within brain-computer interface (BCI) applications. Electroencephalography (EEG), known for its non-invasiveness, ease of use, and cost-effectiveness, has been a popular method in this field. However, current EEG-to-Text decoding approaches face challenges due to the huge domain gap between EEG recordings and raw texts, inherent data bias, and small closed vocabularies. In this paper, we propose SEE: Semantically Aligned EEG-to-Text Translation, a novel method aimed at improving EEG-to-Text decoding by seamlessly integrating two modules into a pre-trained BART language model. 
These two modules include (1) a Cross-Modal Codebook that learns cross-modal representations to enhance feature consolidation and mitigate domain gap, and (2) a Semantic Matching Module that fully utilizes pre-trained text representations to align multi-modal features extracted from EEG-Text pairs while considering noise caused by false negatives, i.e., data from different EEG-Text pairs that have similar semantic meanings. Experimental results on the Zurich Cognitive Language Processing Corpus (ZuCo) demonstrate the effectiveness of SEE, which enhances the feasibility of accurate EEG-to-Text decoding. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.16312v1-abstract-full').style.display = 'none'; document.getElementById('2409.16312v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">4 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.14685">arXiv:2409.14685</a> <span> [<a href="https://arxiv.org/pdf/2409.14685">pdf</a>, <a href="https://arxiv.org/ps/2409.14685">ps</a>, <a href="https://arxiv.org/format/2409.14685">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Near-field Beam Focusing under Discrete Phase Shifters </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Zhang%2C+H">Haodong Zhang</a>, <a href="/search/eess?searchtype=author&query=You%2C+C">Changsheng You</a>, <a href="/search/eess?searchtype=author&query=Zhou%2C+C">Cong Zhou</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.14685v1-abstract-short" style="display: inline;"> Extremely large-scale arrays (XL-arrays) have emerged as a promising technology for enabling near-field communications in future wireless systems. However, the huge number of antennas pose demanding challenges on the hardware cost and energy consumption, especially when the antennas employ high-resolution phase shifters (PSs). To address this issue, in this paper, we consider discrete PSs at the X… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.14685v1-abstract-full').style.display = 'inline'; document.getElementById('2409.14685v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.14685v1-abstract-full" style="display: none;"> Extremely large-scale arrays (XL-arrays) have emerged as a promising technology for enabling near-field communications in future wireless systems. However, the huge number of antennas pose demanding challenges on the hardware cost and energy consumption, especially when the antennas employ high-resolution phase shifters (PSs). 
To address this issue, in this paper, we consider discrete PSs at the XL-array which are practically more energy efficient, and investigate the impact of PS resolution on the near-field beam-focusing effect. To this end, we propose a new Fourier series expansion method to efficiently tackle the difficulty in characterising the beam pattern properties under phase quantization. Interestingly, we analytically show, for the first time, that 1) discrete PSs introduce additional grating lobes; 2) the main lobe still exhibits the beam-focusing effect with its beam power increasing with PS resolution; and 3) there are two types of grating lobes, featured by the beam-focusing and beam-steering effects, respectively. Finally, numerical results demonstrate that the grating lobes generally degrade the communication performance. However, a low-resolution of 3-bit PSs can achieve similar beam pattern and rate performance with the continuous PS counterpart, while it attains much higher energy efficiency. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.14685v1-abstract-full').style.display = 'none'; document.getElementById('2409.14685v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.14022">arXiv:2409.14022</a> <span> [<a href="https://arxiv.org/pdf/2409.14022">pdf</a>, <a href="https://arxiv.org/format/2409.14022">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Universal Modem Generation with Inherent Adaptability to Variant Underwater Acoustic Channels: a Data-Driven Perspective </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=You%2C+X">Xiaoquan You</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+H">Hengyu Zhang</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+X">Xuehan Wang</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+J">Jintao Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.14022v1-abstract-short" style="display: inline;"> In underwater acoustic (UWA) communication, orthogonal frequency division multiplexing (OFDM) is commonly employed to mitigate the inter-symbol interference (ISI) caused by delay spread. However, path-specific Doppler effects in UWA channels could result in significant inter-carrier interference (ICI) in the OFDM system. 
To address this problem, we introduce a multi-resolution convolutional neural… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.14022v1-abstract-full').style.display = 'inline'; document.getElementById('2409.14022v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.14022v1-abstract-full" style="display: none;"> In underwater acoustic (UWA) communication, orthogonal frequency division multiplexing (OFDM) is commonly employed to mitigate the inter-symbol interference (ISI) caused by delay spread. However, path-specific Doppler effects in UWA channels could result in significant inter-carrier interference (ICI) in the OFDM system. To address this problem, we introduce a multi-resolution convolutional neural network (CNN) named UWAModNet in this paper, designed to optimize the modem structure, specifically modulation and demodulation matrices. Based on a trade-off between the minimum and the average equivalent sub-channel rate, we propose an optimization criterion suitable to evaluate the performance of our learned modem. Additionally, a two-stage training strategy is developed to achieve quasi-optimal results. Simulations indicate that the learned modem outperforms zero-padded OFDM (ZP-OFDM) in terms of equivalent sub-channel rate and bit error rate, even under more severe Doppler effects during testing compared to training. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.14022v1-abstract-full').style.display = 'none'; document.getElementById('2409.14022v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">6 pages, 3 figures, submitted to WCSP'24</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.12139">arXiv:2409.12139</a> <span> [<a href="https://arxiv.org/pdf/2409.12139">pdf</a>, <a href="https://arxiv.org/format/2409.12139">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Takin: A Cohort of Superior Quality Zero-shot Speech Generation Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Chen%2C+S">Sijing Chen</a>, <a href="/search/eess?searchtype=author&query=Feng%2C+Y">Yuan Feng</a>, <a href="/search/eess?searchtype=author&query=He%2C+L">Laipeng He</a>, <a href="/search/eess?searchtype=author&query=He%2C+T">Tianwei He</a>, <a href="/search/eess?searchtype=author&query=He%2C+W">Wendi He</a>, <a href="/search/eess?searchtype=author&query=Hu%2C+Y">Yanni Hu</a>, <a href="/search/eess?searchtype=author&query=Lin%2C+B">Bin Lin</a>, <a href="/search/eess?searchtype=author&query=Lin%2C+Y">Yiting Lin</a>, <a href="/search/eess?searchtype=author&query=Pan%2C+Y">Yu Pan</a>, <a href="/search/eess?searchtype=author&query=Tan%2C+P">Pengfei Tan</a>, <a href="/search/eess?searchtype=author&query=Tian%2C+C">Chengwei Tian</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+C">Chen Wang</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+Z">Zhicheng Wang</a>, <a href="/search/eess?searchtype=author&query=Xie%2C+R">Ruoye Xie</a>, <a href="/search/eess?searchtype=author&query=Yao%2C+J">Jixun Yao</a>, <a href="/search/eess?searchtype=author&query=Yan%2C+Q">Quanlei Yan</a>, <a href="/search/eess?searchtype=author&query=Yang%2C+Y">Yuguang Yang</a>, <a href="/search/eess?searchtype=author&query=Ye%2C+J">Jianhao Ye</a>, <a href="/search/eess?searchtype=author&query=Yin%2C+J">Jingjing Yin</a>, <a href="/search/eess?searchtype=author&query=Yu%2C+Y">Yanzhen Yu</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+H">Huimin Zhang</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+X">Xiang Zhang</a>, <a href="/search/eess?searchtype=author&query=Zhao%2C+G">Guangcheng Zhao</a>, <a href="/search/eess?searchtype=author&query=Zhou%2C+H">Hongbin Zhou</a>, <a href="/search/eess?searchtype=author&query=Zou%2C+P">Pengpeng Zou</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.12139v3-abstract-short" style="display: inline;"> With the advent of the big data and large language model era, zero-shot personalized rapid customization has emerged as a significant trend. In this report, we introduce Takin AudioLLM, a series of techniques and models, mainly including Takin TTS, Takin VC, and Takin Morphing, specifically designed for audiobook production. 
These models are capable of zero-shot speech production, generating high-… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.12139v3-abstract-full').style.display = 'inline'; document.getElementById('2409.12139v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.12139v3-abstract-full" style="display: none;"> With the advent of the big data and large language model era, zero-shot personalized rapid customization has emerged as a significant trend. In this report, we introduce Takin AudioLLM, a series of techniques and models, mainly including Takin TTS, Takin VC, and Takin Morphing, specifically designed for audiobook production. These models are capable of zero-shot speech production, generating high-quality speech that is nearly indistinguishable from real human speech and facilitating individuals to customize the speech content according to their own needs. Specifically, we first introduce Takin TTS, a neural codec language model that builds upon an enhanced neural speech codec and a multi-task training framework, capable of generating high-fidelity natural speech in a zero-shot way. For Takin VC, we advocate an effective content and timbre joint modeling approach to improve the speaker similarity, while advocating for a conditional flow matching based decoder to further enhance its naturalness and expressiveness. Last, we propose the Takin Morphing system with highly decoupled and advanced timbre and prosody modeling approaches, which enables individuals to customize speech production with their preferred timbre and prosody in a precise and controllable manner. Extensive experiments validate the effectiveness and robustness of our Takin AudioLLM series models. For detailed demos, please refer to https://everest-ai.github.io/takinaudiollm/. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.12139v3-abstract-full').style.display = 'none'; document.getElementById('2409.12139v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 18 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Technical Report; 18 pages; typos corrected, references added, demo url modified, author name modified;</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.11111">arXiv:2409.11111</a> <span> [<a href="https://arxiv.org/pdf/2409.11111">pdf</a>, <a href="https://arxiv.org/format/2409.11111">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Few-Shot Domain Adaptation for Learned Image Compression </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Zhang%2C+T">Tianyu Zhang</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+H">Haotian Zhang</a>, <a href="/search/eess?searchtype=author&query=Li%2C+Y">Yuqi Li</a>, <a href="/search/eess?searchtype=author&query=Li%2C+L">Li Li</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+D">Dong Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.11111v1-abstract-short" style="display: inline;"> Learned image compression (LIC) has achieved state-of-the-art rate-distortion performance, deemed promising for next-generation image compression techniques. However, pre-trained LIC models usually suffer from significant performance degradation when applied to out-of-training-domain images, implying their poor generalization capabilities. To tackle this problem, we propose a few-shot domain adapt… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.11111v1-abstract-full').style.display = 'inline'; document.getElementById('2409.11111v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.11111v1-abstract-full" style="display: none;"> Learned image compression (LIC) has achieved state-of-the-art rate-distortion performance, deemed promising for next-generation image compression techniques. However, pre-trained LIC models usually suffer from significant performance degradation when applied to out-of-training-domain images, implying their poor generalization capabilities. To tackle this problem, we propose a few-shot domain adaptation method for LIC by integrating plug-and-play adapters into pre-trained models. Drawing inspiration from the analogy between latent channels and frequency components, we examine domain gaps in LIC and observe that out-of-training-domain images disrupt pre-trained channel-wise decomposition. Consequently, we introduce a method for channel-wise re-allocation using convolution-based adapters and low-rank adapters, which are lightweight and compatible to mainstream LIC schemes. Extensive experiments across multiple domains and multiple representative LIC schemes demonstrate that our method significantly enhances pre-trained models, achieving comparable performance to H.266/VVC intra coding with merely 25 target-domain samples. 
Additionally, our method matches the performance of full-model fine-tuning while transmitting fewer than $2\%$ of the parameters. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.11111v1-abstract-full').style.display = 'none'; document.getElementById('2409.11111v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.10900">arXiv:2409.10900</a> <span> [<a href="https://arxiv.org/pdf/2409.10900">pdf</a>, <a href="https://arxiv.org/format/2409.10900">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Channel Correlation Matrix Extrapolation Based on Roughness Calibration of Scatterers </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Zhang%2C+H">Heling Zhang</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+X">Xiujun Zhang</a>, <a href="/search/eess?searchtype=author&query=Zhong%2C+X">Xiaofeng Zhong</a>, <a href="/search/eess?searchtype=author&query=Zhou%2C+S">Shidong Zhou</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.10900v2-abstract-short" style="display: inline;"> To estimate the channel correlation matrix (CCM) in areas where channel information cannot be collected in advance, this paper proposes a way to spatially extrapolate CCM based on the calibration of the surface roughness parameters of scatterers in the propagation scene. We calibrate the roughness parameters of scene scatterers based on CCM data in some specific areas. From these calibrated roughnes… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.10900v2-abstract-full').style.display = 'inline'; document.getElementById('2409.10900v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.10900v2-abstract-full" style="display: none;"> To estimate the channel correlation matrix (CCM) in areas where channel information cannot be collected in advance, this paper proposes a way to spatially extrapolate CCM based on the calibration of the surface roughness parameters of scatterers in the propagation scene. We calibrate the roughness parameters of scene scatterers based on CCM data in some specific areas. From these calibrated roughness parameters, we are able to generate a good prediction of the CCM for any other area in the scene by performing ray tracing. Simulation results show that the channel extrapolation method proposed in this paper can effectively realize the extrapolation of the CCM between different areas in the frequency domain, or even from one domain to another. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.10900v2-abstract-full').style.display = 'none'; document.getElementById('2409.10900v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 17 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">5 pages, 5 figures,2024 IEEE 24th International Conference on Communication Technology (ICCT 2024)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.10819">arXiv:2409.10819</a> <span> [<a href="https://arxiv.org/pdf/2409.10819">pdf</a>, <a href="https://arxiv.org/format/2409.10819">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> EzAudio: Enhancing Text-to-Audio Generation with Efficient Diffusion Transformer </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Hai%2C+J">Jiarui Hai</a>, <a href="/search/eess?searchtype=author&query=Xu%2C+Y">Yong Xu</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+H">Hao Zhang</a>, <a href="/search/eess?searchtype=author&query=Li%2C+C">Chenxing Li</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+H">Helin Wang</a>, <a href="/search/eess?searchtype=author&query=Elhilali%2C+M">Mounya Elhilali</a>, <a href="/search/eess?searchtype=author&query=Yu%2C+D">Dong Yu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.10819v1-abstract-short" style="display: inline;"> Latent diffusion models have shown promising results in text-to-audio (T2A) generation tasks, yet previous models have encountered difficulties in generation quality, computational cost, diffusion sampling, and data preparation. In this paper, we introduce EzAudio, a transformer-based T2A diffusion model, to handle these challenges. Our approach includes several key innovations: (1) We build the T… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.10819v1-abstract-full').style.display = 'inline'; document.getElementById('2409.10819v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.10819v1-abstract-full" style="display: none;"> Latent diffusion models have shown promising results in text-to-audio (T2A) generation tasks, yet previous models have encountered difficulties in generation quality, computational cost, diffusion sampling, and data preparation. In this paper, we introduce EzAudio, a transformer-based T2A diffusion model, to handle these challenges. 
Our approach includes several key innovations: (1) We build the T2A model on the latent space of a 1D waveform Variational Autoencoder (VAE), avoiding the complexities of handling 2D spectrogram representations and using an additional neural vocoder. (2) We design an optimized diffusion transformer architecture specifically tailored for audio latent representations and diffusion modeling, which enhances convergence speed, training stability, and memory usage, making the training process easier and more efficient. (3) To tackle data scarcity, we adopt a data-efficient training strategy that leverages unlabeled data for learning acoustic dependencies, audio caption data annotated by audio-language models for text-to-audio alignment learning, and human-labeled data for fine-tuning. (4) We introduce a classifier-free guidance (CFG) rescaling method that simplifies EzAudio by achieving strong prompt alignment while preserving great audio quality when using larger CFG scores, eliminating the need to struggle with finding the optimal CFG score to balance this trade-off. EzAudio surpasses existing open-source models in both objective metrics and subjective evaluations, delivering realistic listening experiences while maintaining a streamlined model structure, low training costs, and an easy-to-follow training pipeline. Code, data, and pre-trained models are released at: https://haidog-yaqub.github.io/EzAudio-Page/. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.10819v1-abstract-full').style.display = 'none'; document.getElementById('2409.10819v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">submitted to ICASSP 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.10157">arXiv:2409.10157</a> <span> [<a href="https://arxiv.org/pdf/2409.10157">pdf</a>, <a href="https://arxiv.org/format/2409.10157">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Emo-DPO: Controllable Emotional Speech Synthesis through Direct Preference Optimization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Gao%2C+X">Xiaoxue Gao</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+C">Chen Zhang</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+Y">Yiming Chen</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+H">Huayun Zhang</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+N+F">Nancy F. 
Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.10157v1-abstract-short" style="display: inline;"> Current emotional text-to-speech (TTS) models predominantly conduct supervised training to learn the conversion from text and desired emotion to its emotional speech, focusing on a single emotion per text-speech pair. These models only learn the correct emotional outputs without fully comprehending other emotion characteristics, which limits their capabilities of capturing the nuances between diff… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.10157v1-abstract-full').style.display = 'inline'; document.getElementById('2409.10157v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.10157v1-abstract-full" style="display: none;"> Current emotional text-to-speech (TTS) models predominantly conduct supervised training to learn the conversion from text and desired emotion to its emotional speech, focusing on a single emotion per text-speech pair. These models only learn the correct emotional outputs without fully comprehending other emotion characteristics, which limits their capabilities of capturing the nuances between different emotions. We propose a controllable Emo-DPO approach, which employs direct preference optimization to differentiate subtle emotional nuances between emotions through optimizing towards preferred emotions over less preferred emotional ones. Instead of relying on traditional neural architectures used in existing emotional TTS models, we propose utilizing the emotion-aware LLM-TTS neural architecture to leverage LLMs' in-context learning and instruction-following capabilities. Comprehensive experiments confirm that our proposed method outperforms the existing baselines. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.10157v1-abstract-full').style.display = 'none'; document.getElementById('2409.10157v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">5 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.08795">arXiv:2409.08795</a> <span> [<a href="https://arxiv.org/pdf/2409.08795">pdf</a>, <a href="https://arxiv.org/format/2409.08795">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> </div> </div> <p class="title is-5 mathjax"> LLaQo: Towards a Query-Based Coach in Expressive Music Performance Assessment </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Zhang%2C+H">Huan Zhang</a>, <a href="/search/eess?searchtype=author&query=Cheung%2C+V">Vincent Cheung</a>, <a href="/search/eess?searchtype=author&query=Nishioka%2C+H">Hayato Nishioka</a>, <a href="/search/eess?searchtype=author&query=Dixon%2C+S">Simon Dixon</a>, <a href="/search/eess?searchtype=author&query=Furuya%2C+S">Shinichi Furuya</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.08795v2-abstract-short" style="display: inline;"> Research in music understanding has extensively explored composition-level attributes such as key, genre, and instrumentation through advanced representations, leading to cross-modal applications using large language models. However, aspects of musical performance such as stylistic expression and technique remain underexplored, along with the potential of using large language models to enhance edu… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.08795v2-abstract-full').style.display = 'inline'; document.getElementById('2409.08795v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.08795v2-abstract-full" style="display: none;"> Research in music understanding has extensively explored composition-level attributes such as key, genre, and instrumentation through advanced representations, leading to cross-modal applications using large language models. However, aspects of musical performance such as stylistic expression and technique remain underexplored, along with the potential of using large language models to enhance educational outcomes with customized feedback. To bridge this gap, we introduce LLaQo, a Large Language Query-based music coach that leverages audio language modeling to provide detailed and formative assessments of music performances. We also introduce instruction-tuned query-response datasets that cover a variety of performance dimensions from pitch accuracy to articulation, as well as contextual performance understanding (such as difficulty and performance techniques). Utilizing AudioMAE encoder and Vicuna-7b LLM backend, our model achieved state-of-the-art (SOTA) results in predicting teachers' performance ratings, as well as in identifying piece difficulty and playing techniques. Textual responses from LLaQo was moreover rated significantly higher compared to other baseline models in a user study using audio-text matching. 
Our proposed model can thus provide informative answers to open-ended questions related to musical performance from audio data. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.08795v2-abstract-full').style.display = 'none'; document.getElementById('2409.08795v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 13 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.08481">arXiv:2409.08481</a> <span> [<a href="https://arxiv.org/pdf/2409.08481">pdf</a>, <a href="https://arxiv.org/format/2409.08481">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> USTC-TD: A Test Dataset and Benchmark for Image and Video Coding in 2020s </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Li%2C+Z">Zhuoyuan Li</a>, <a href="/search/eess?searchtype=author&query=Liao%2C+J">Junqi Liao</a>, <a href="/search/eess?searchtype=author&query=Tang%2C+C">Chuanbo Tang</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+H">Haotian Zhang</a>, <a href="/search/eess?searchtype=author&query=Li%2C+Y">Yuqi Li</a>, <a href="/search/eess?searchtype=author&query=Bian%2C+Y">Yifan Bian</a>, <a href="/search/eess?searchtype=author&query=Sheng%2C+X">Xihua Sheng</a>, <a href="/search/eess?searchtype=author&query=Feng%2C+X">Xinmin Feng</a>, <a href="/search/eess?searchtype=author&query=Li%2C+Y">Yao Li</a>, <a href="/search/eess?searchtype=author&query=Gao%2C+C">Changsheng Gao</a>, <a href="/search/eess?searchtype=author&query=Li%2C+L">Li Li</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+D">Dong Liu</a>, <a href="/search/eess?searchtype=author&query=Wu%2C+F">Feng Wu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.08481v2-abstract-short" style="display: inline;"> Image/video coding has been a remarkable research area for both academia and industry for many years. Testing datasets, especially high-quality image/video datasets are desirable for the justified evaluation of coding-related research, practical applications, and standardization activities. We put forward a test dataset namely USTC-TD, which has been successfully adopted in the practical end-to-en… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.08481v2-abstract-full').style.display = 'inline'; document.getElementById('2409.08481v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.08481v2-abstract-full" style="display: none;"> Image/video coding has been a remarkable research area for both academia and industry for many years. 
Testing datasets, especially high-quality image/video datasets are desirable for the justified evaluation of coding-related research, practical applications, and standardization activities. We put forward a test dataset namely USTC-TD, which has been successfully adopted in the practical end-to-end image/video coding challenge of the IEEE International Conference on Visual Communications and Image Processing (VCIP) in 2022 and 2023. USTC-TD contains 40 images at 4K spatial resolution and 10 video sequences at 1080p spatial resolution, featuring varied content due to diverse environmental factors (e.g. scene type, texture, motion, view) and designed imaging factors (e.g. illumination, lens, shadow). We quantitatively evaluate USTC-TD on different image/video features (spatial, temporal, color, lightness), and compare it with previous image/video test datasets, which verifies the wider coverage and greater diversity of the proposed dataset. We also evaluate both classic standardized and recent learned image/video coding schemes on USTC-TD with PSNR and MS-SSIM, and provide an extensive benchmark for the evaluated schemes. Based on the characteristics and specific design of the proposed test dataset, we analyze the benchmark performance and shed light on future research and development of image/video coding. All the data are released online: https://esakak.github.io/USTC-TD . <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.08481v2-abstract-full').style.display = 'none'; document.getElementById('2409.08481v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 12 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">23 pages. Project Page: https://esakak.github.io/USTC-TD</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.08155">arXiv:2409.08155</a> <span> [<a href="https://arxiv.org/pdf/2409.08155">pdf</a>, <a href="https://arxiv.org/format/2409.08155">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Hierarchical Symbolic Pop Music Generation with Graph Neural Networks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Lim%2C+W+Q">Wen Qing Lim</a>, <a href="/search/eess?searchtype=author&query=Liang%2C+J">Jinhua Liang</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+H">Huan Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.08155v1-abstract-short" style="display: inline;"> Music is inherently made up of complex structures, and representing them as graphs helps to capture multiple levels of relationships. While music generation has been explored using various deep generation techniques, research on graph-related music generation is sparse.
Earlier graph-based music generation worked only on generating melodies, and recent works to generate polyphonic music do not acc… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.08155v1-abstract-full').style.display = 'inline'; document.getElementById('2409.08155v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.08155v1-abstract-full" style="display: none;"> Music is inherently made up of complex structures, and representing them as graphs helps to capture multiple levels of relationships. While music generation has been explored using various deep generation techniques, research on graph-related music generation is sparse. Earlier graph-based music generation worked only on generating melodies, and recent works to generate polyphonic music do not account for longer-term structure. In this paper, we explore a multi-graph approach to represent both the rhythmic patterns and phrase structure of Chinese pop music. Consequently, we propose a two-step approach that aims to generate polyphonic music with coherent rhythm and long-term structure. We train two Variational Auto-Encoder networks - one on a MIDI dataset to generate 4-bar phrases, and another on song structure labels to generate full song structure. Our work shows that the models are able to learn most of the structural nuances in the training dataset, including chord and pitch frequency distributions, and phrase attributes. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.08155v1-abstract-full').style.display = 'none'; document.getElementById('2409.08155v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. 
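<p class="is-size-7">The two-step pipeline above trains variational auto-encoders on 4-bar phrases and on song-structure labels, but the abstract gives no architectural details. Purely as an illustration of the kind of building block such a pipeline could rest on, here is a generic, self-contained PyTorch VAE; the input dimension, layer sizes, and reconstruction loss are placeholder assumptions, not values from the paper.</p> <pre><code class="language-python">
# Generic VAE skeleton: encode, reparameterize, decode, and train with a
# reconstruction term plus a KL penalty toward the standard normal prior.
import torch
import torch.nn as nn
import torch.nn.functional as F

class PhraseVAE(nn.Module):
    def __init__(self, input_dim=512, hidden=256, latent=64):
        super().__init__()
        self.enc = nn.Linear(input_dim, hidden)
        self.mu = nn.Linear(hidden, latent)
        self.logvar = nn.Linear(hidden, latent)
        self.dec = nn.Sequential(nn.Linear(latent, hidden), nn.ReLU(),
                                 nn.Linear(hidden, input_dim))

    def forward(self, x):
        h = F.relu(self.enc(x))
        mu, logvar = self.mu(h), self.logvar(h)
        z = mu + torch.randn_like(mu) * torch.exp(0.5 * logvar)  # reparameterization trick
        return self.dec(z), mu, logvar

def vae_loss(recon, x, mu, logvar):
    recon_term = F.mse_loss(recon, x, reduction="sum")
    kl_term = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp())
    return recon_term + kl_term
</code></pre>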
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.07827">arXiv:2409.07827</a> <span> [<a href="https://arxiv.org/pdf/2409.07827">pdf</a>, <a href="https://arxiv.org/format/2409.07827">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Bridging Paintings and Music -- Exploring Emotion based Music Generation through Paintings </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Hisariya%2C+T">Tanisha Hisariya</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+H">Huan Zhang</a>, <a href="/search/eess?searchtype=author&query=Liang%2C+J">Jinhua Liang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.07827v1-abstract-short" style="display: inline;"> Rapid advancements in artificial intelligence have significantly enhanced generative tasks involving music and images, employing both unimodal and multimodal approaches. This research develops a model capable of generating music that resonates with the emotions depicted in visual arts, integrating emotion labeling, image captioning, and language models to transform visual inputs into musical compo… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.07827v1-abstract-full').style.display = 'inline'; document.getElementById('2409.07827v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.07827v1-abstract-full" style="display: none;"> Rapid advancements in artificial intelligence have significantly enhanced generative tasks involving music and images, employing both unimodal and multimodal approaches. This research develops a model capable of generating music that resonates with the emotions depicted in visual arts, integrating emotion labeling, image captioning, and language models to transform visual inputs into musical compositions. Addressing the scarcity of aligned art and music data, we curated the Emotion Painting Music Dataset, pairing paintings with corresponding music for effective training and evaluation. Our dual-stage framework converts images to text descriptions of emotional content and then transforms these descriptions into music, facilitating efficient learning with minimal data. Performance is evaluated using metrics such as Fréchet Audio Distance (FAD), Total Harmonic Distortion (THD), Inception Score (IS), and KL divergence, with audio-emotion text similarity confirmed by the pre-trained CLAP model to demonstrate high alignment between generated music and text. This synthesis tool bridges visual art and music, enhancing accessibility for the visually impaired and opening avenues in educational and therapeutic applications by providing enriched multi-sensory experiences.
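<p class="is-size-7">Among the metrics listed above, Fréchet Audio Distance (FAD) is the least self-explanatory. For orientation only, the sketch below computes the standard Fréchet distance between Gaussian fits of two sets of audio embeddings; producing the embeddings (and the choice of audio encoder) is assumed to happen elsewhere and is not taken from the paper.</p> <pre><code class="language-python">
# Fréchet Audio Distance between two embedding sets (rows are embeddings of
# reference and generated audio, e.g. from a pretrained audio encoder).
import numpy as np
from scipy import linalg

def frechet_audio_distance(emb_real, emb_gen):
    mu_r, mu_g = emb_real.mean(axis=0), emb_gen.mean(axis=0)
    cov_r = np.cov(emb_real, rowvar=False)
    cov_g = np.cov(emb_gen, rowvar=False)
    covmean, _ = linalg.sqrtm(cov_r @ cov_g, disp=False)
    covmean = covmean.real  # drop tiny imaginary parts from numerical error
    diff = mu_r - mu_g
    return float(diff @ diff + np.trace(cov_r + cov_g - 2.0 * covmean))
</code></pre>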
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.07827v1-abstract-full').style.display = 'none'; document.getElementById('2409.07827v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.07417">arXiv:2409.07417</a> <span> [<a href="https://arxiv.org/pdf/2409.07417">pdf</a>, <a href="https://arxiv.org/format/2409.07417">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Efficient One-Step Diffusion Refinement for Snapshot Compressive Imaging </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Wang%2C+Y">Yunzhen Wang</a>, <a href="/search/eess?searchtype=author&query=Zeng%2C+H">Haijin Zeng</a>, <a href="/search/eess?searchtype=author&query=Huang%2C+S">Shaoguang Huang</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+H">Hongyu Chen</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+H">Hongyan Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.07417v1-abstract-short" style="display: inline;"> Coded Aperture Snapshot Spectral Imaging (CASSI) is a crucial technique for capturing three-dimensional multispectral images (MSIs) through the complex inverse task of reconstructing these images from coded two-dimensional measurements. Current state-of-the-art methods, predominantly end-to-end, face limitations in reconstructing high-frequency details and often rely on constrained datasets like K… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.07417v1-abstract-full').style.display = 'inline'; document.getElementById('2409.07417v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.07417v1-abstract-full" style="display: none;"> Coded Aperture Snapshot Spectral Imaging (CASSI) is a crucial technique for capturing three-dimensional multispectral images (MSIs) through the complex inverse task of reconstructing these images from coded two-dimensional measurements. Current state-of-the-art methods, predominantly end-to-end, face limitations in reconstructing high-frequency details and often rely on constrained datasets like KAIST and CAVE, resulting in models with poor generalizability. In response to these challenges, this paper introduces a novel one-step Diffusion Probabilistic Model within a self-supervised adaptation framework for Snapshot Compressive Imaging (SCI). Our approach leverages a pretrained SCI reconstruction network to generate initial predictions from two-dimensional measurements. Subsequently, a one-step diffusion model produces high-frequency residuals to enhance these initial predictions. 
Additionally, acknowledging the high costs associated with collecting MSIs, we develop a self-supervised paradigm based on the Equivariant Imaging (EI) framework. Experimental results validate the superiority of our model compared to previous methods, showcasing its simplicity and adaptability to various end-to-end or unfolding techniques. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.07417v1-abstract-full').style.display = 'none'; document.getElementById('2409.07417v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.06610">arXiv:2409.06610</a> <span> [<a href="https://arxiv.org/pdf/2409.06610">pdf</a>, <a href="https://arxiv.org/ps/2409.06610">ps</a>, <a href="https://arxiv.org/format/2409.06610">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Science and Game Theory">cs.GT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> Bayesian hypergame approach to equilibrium stability and robustness in moving target defense </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Zhang%2C+H">Hanzheng Zhang</a>, <a href="/search/eess?searchtype=author&query=Cheng%2C+Z">Zhaoyang Cheng</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+G">Guanpu Chen</a>, <a href="/search/eess?searchtype=author&query=Johansson%2C+K+H">Karl Henrik Johansson</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.06610v1-abstract-short" style="display: inline;"> We investigate the equilibrium stability and robustness in a class of moving target defense problems, in which players have both incomplete information and asymmetric cognition. We first establish a Bayesian Stackelberg game model for incomplete information and then employ a hypergame reformulation to address asymmetric cognition. With the core concept of the hyper Bayesian Nash equilibrium (HBNE)… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.06610v1-abstract-full').style.display = 'inline'; document.getElementById('2409.06610v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.06610v1-abstract-full" style="display: none;"> We investigate the equilibrium stability and robustness in a class of moving target defense problems, in which players have both incomplete information and asymmetric cognition. We first establish a Bayesian Stackelberg game model for incomplete information and then employ a hypergame reformulation to address asymmetric cognition. With the core concept of the hyper Bayesian Nash equilibrium (HBNE), a condition for achieving both the strategic and cognitive stability in equilibria can be realized by solving linear equations. 
Moreover, to deal with players' underlying perturbed knowledge, we study the equilibrium robustness by presenting a condition of robust HBNE under the given configuration. Experiments evaluate our theoretical results. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.06610v1-abstract-full').style.display = 'none'; document.getElementById('2409.06610v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.03269">arXiv:2409.03269</a> <span> [<a href="https://arxiv.org/pdf/2409.03269">pdf</a>, <a href="https://arxiv.org/format/2409.03269">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> A spherical harmonic-domain spatial audio signal enhancement method based on minimum variance distortionless response </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Zhang%2C+H">Huawei Zhang</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+J">Jihui Zhang</a>, <a href="/search/eess?searchtype=author&query=Sun%2C+H">Huiyuan Sun</a>, <a href="/search/eess?searchtype=author&query=Samarasinghe%2C+P">Prasanga Samarasinghe</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.03269v1-abstract-short" style="display: inline;"> Spatial audio signal enhancement aims to reduce interfering source contributions while preserving the desired sound field with its spatial cues intact. Existing methods generally rely on impractical assumptions (e.g. no reverberation, or accurate estimation of information that is impractical to obtain) or have limited applicability. This paper presents a spherical harmonic (SH)-domain minimum variance distortionles… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.03269v1-abstract-full').style.display = 'inline'; document.getElementById('2409.03269v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.03269v1-abstract-full" style="display: none;"> Spatial audio signal enhancement aims to reduce interfering source contributions while preserving the desired sound field with its spatial cues intact. Existing methods generally rely on impractical assumptions (e.g. no reverberation, or accurate estimation of information that is impractical to obtain) or have limited applicability. This paper presents a spherical harmonic (SH)-domain minimum variance distortionless response (MVDR)-based spatial signal enhancer using Relative Harmonic Coefficients (ReHCs) to extract clean SH coefficients from noisy ones in reverberant environments.
A simulation study shows the proposed method achieves lower estimation error, higher speech-distortion-ratio (SDR), and comparable noise reduction (NR) within the sweet area in a reverberant environment, compared to a beamforming-and-projection method as the baseline. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.03269v1-abstract-full').style.display = 'none'; document.getElementById('2409.03269v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.03251">arXiv:2409.03251</a> <span> [<a href="https://arxiv.org/pdf/2409.03251">pdf</a>, <a href="https://arxiv.org/format/2409.03251">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> Dual-TSST: A Dual-Branch Temporal-Spectral-Spatial Transformer Model for EEG Decoding </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Li%2C+H">Hongqi Li</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+H">Haodong Zhang</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+Y">Yitong Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.03251v1-abstract-short" style="display: inline;"> The decoding of electroencephalography (EEG) signals provides convenient access to user intentions and plays an important role in human-machine interaction. To effectively extract sufficient characteristics of the multichannel EEG, a novel decoding architecture network with a dual-branch temporal-spectral-spatial transformer (Dual-TSST) is proposed in this study. Specifically, by… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.03251v1-abstract-full').style.display = 'inline'; document.getElementById('2409.03251v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.03251v1-abstract-full" style="display: none;"> The decoding of electroencephalography (EEG) signals provides convenient access to user intentions and plays an important role in human-machine interaction. To effectively extract sufficient characteristics of the multichannel EEG, a novel decoding architecture network with a dual-branch temporal-spectral-spatial transformer (Dual-TSST) is proposed in this study. Specifically, by utilizing convolutional neural networks (CNNs) on different branches, the proposed processing network first extracts the temporal-spatial features of the original EEG and the temporal-spectral-spatial features of time-frequency domain data converted by wavelet transformation, respectively.
These features are then integrated by a feature fusion block and serve as the input to the transformer, which captures the global long-range dependencies in the non-stationary EEG before classification via global average pooling and multi-layer perceptron blocks. To evaluate the efficacy of the proposed approach, comparative experiments are conducted on three publicly available datasets (BCI IV 2a, BCI IV 2b, and SEED), with head-to-head comparisons against more than ten state-of-the-art methods. As a result, the proposed Dual-TSST performs superiorly across tasks, achieving average classification accuracies of 80.67% on BCI IV 2a, 88.64% on BCI IV 2b, and 96.65% on SEED. Extensive ablation experiments against comparative baseline models also show that each module of the proposed method contributes to the improved decoding performance. This study provides a new approach to high-performance EEG decoding and has great potential for future CNN-Transformer based applications. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.03251v1-abstract-full').style.display = 'none'; document.getElementById('2409.03251v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.01957">arXiv:2409.01957</a> <span> [<a href="https://arxiv.org/pdf/2409.01957">pdf</a>, <a href="https://arxiv.org/ps/2409.01957">ps</a>, <a href="https://arxiv.org/format/2409.01957">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Power Control and Random Serving Mode Allocation for CJT-NCJT Hybrid Mode Enabled Cell-Free Massive MIMO With Limited Fronthauls </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Zhang%2C+H">Hangyu Zhang</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+R">Rui Zhang</a>, <a href="/search/eess?searchtype=author&query=Li%2C+Y">Yongzhao Li</a>, <a href="/search/eess?searchtype=author&query=Ruan%2C+Y">Yuhan Ruan</a>, <a href="/search/eess?searchtype=author&query=Li%2C+T">Tao Li</a>, <a href="/search/eess?searchtype=author&query=Yang%2C+D">Dong Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.01957v1-abstract-short" style="display: inline;"> With great potential to improve service fairness and quality for user equipments (UEs), cell-free massive multiple-input multiple-output (mMIMO) has been regarded as an emerging candidate for 6G network architectures.
Under ideal assumptions, the coherent joint transmission (CJT) serving mode has been considered an optimal option for cell-free mMIMO systems, since it can achieve coheren… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.01957v1-abstract-full').style.display = 'inline'; document.getElementById('2409.01957v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.01957v1-abstract-full" style="display: none;"> With great potential to improve service fairness and quality for user equipments (UEs), cell-free massive multiple-input multiple-output (mMIMO) has been regarded as an emerging candidate for 6G network architectures. Under ideal assumptions, the coherent joint transmission (CJT) serving mode has been considered an optimal option for cell-free mMIMO systems, since it can achieve coherent cooperation gain among the access points. However, when considering the limited fronthaul constraint in practice, the non-coherent joint transmission (NCJT) serving mode is likely to outperform CJT, since the former requires far fewer fronthaul resources. In other words, whether a single serving mode (CJT or NCJT) performs well or poorly depends on the fronthaul capacity, and no single transmission mode can perfectly adapt to a capacity-limited fronthaul. To explore the performance potential of the cell-free mMIMO system with limited fronthauls by harnessing the merits of CJT and NCJT, we propose a CJT-NCJT hybrid serving mode framework, in which each UE is allocated to operate in either the CJT or NCJT serving mode. To improve the sum-rate of the system with low complexity, we first propose a probability-based random serving mode allocation scheme. For a given serving mode allocation, a successive convex approximation-based power allocation algorithm is proposed to maximize the system's sum-rate. Simulation results demonstrate the superiority of the proposed scheme. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.01957v1-abstract-full').style.display = 'none'; document.getElementById('2409.01957v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024.
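<p class="is-size-7">The probability-based random serving mode allocation mentioned above is not spelled out in the abstract. Purely to illustrate the general idea, the sketch below draws each UE's mode at random and keeps the best draw under a caller-supplied sum-rate evaluator; the probability value, the number of draws, and the sum_rate callable are hypothetical placeholders, and the paper's successive convex approximation power control is not shown.</p> <pre><code class="language-python">
# Illustrative random serving-mode allocation: each UE is independently
# assigned CJT with probability p_cjt (otherwise NCJT), and the best of
# several random draws under a caller-supplied sum-rate evaluator is kept.
import random

def random_mode_allocation(num_ues, sum_rate, p_cjt=0.5, draws=100, seed=0):
    rng = random.Random(seed)
    best_modes, best_rate = None, float("-inf")
    for _ in range(draws):
        modes = ["CJT" if p_cjt > rng.random() else "NCJT" for _ in range(num_ues)]
        rate = sum_rate(modes)  # hypothetical evaluator, e.g. a link-level simulator
        if rate > best_rate:
            best_modes, best_rate = modes, rate
    return best_modes, best_rate
</code></pre>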
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">6 pages, 2 figures, accepted by GLOBECOM 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.00276">arXiv:2409.00276</a> <span> [<a href="https://arxiv.org/pdf/2409.00276">pdf</a>, <a href="https://arxiv.org/format/2409.00276">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Optimization and Control">math.OC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> Exact Recovery Guarantees for Parameterized Non-linear System Identification Problem under Adversarial Attacks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Zhang%2C+H">Haixiang Zhang</a>, <a href="/search/eess?searchtype=author&query=Yalcin%2C+B">Baturalp Yalcin</a>, <a href="/search/eess?searchtype=author&query=Lavaei%2C+J">Javad Lavaei</a>, <a href="/search/eess?searchtype=author&query=Sontag%2C+E+D">Eduardo D. Sontag</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.00276v2-abstract-short" style="display: inline;"> In this work, we study the system identification problem for parameterized non-linear systems using basis functions under adversarial attacks. Motivated by the LASSO-type estimators, we analyze the exact recovery property of a non-smooth estimator, which is generated by solving an embedded $\ell_1$-loss minimization problem. First, we derive necessary and sufficient conditions for the well-specifi… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.00276v2-abstract-full').style.display = 'inline'; document.getElementById('2409.00276v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.00276v2-abstract-full" style="display: none;"> In this work, we study the system identification problem for parameterized non-linear systems using basis functions under adversarial attacks. Motivated by the LASSO-type estimators, we analyze the exact recovery property of a non-smooth estimator, which is generated by solving an embedded $\ell_1$-loss minimization problem. First, we derive necessary and sufficient conditions for the well-specifiedness of the estimator and the uniqueness of global solutions to the underlying optimization problem. Next, we provide exact recovery guarantees for the estimator under two different scenarios of boundedness and Lipschitz continuity of the basis functions. The non-asymptotic exact recovery is guaranteed with high probability, even when there are more severely corrupted data than clean data. Finally, we numerically illustrate the validity of our theory. This is the first study on the sample complexity analysis of a non-smooth estimator for the non-linear system identification problem. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.00276v2-abstract-full').style.display = 'none'; document.getElementById('2409.00276v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 30 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">33 pages</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">MSC Class:</span> 62; 90; 93 </p> </li> </ol> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Zhang%2C+H&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Zhang%2C+H&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Zhang%2C+H&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Zhang%2C+H&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Zhang%2C+H&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Zhang%2C+H&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">…</span></li> </ul> </nav> <div class="is-hidden-tablet"> <!-- feedback for mobile only --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a> </span> </div> </div> </main> <footer> <div class="columns is-desktop" role="navigation" aria-label="Secondary"> <!-- MetaColumn 1 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 
358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div> <!-- end MetaColumn 1 --> <!-- MetaColumn 2 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div> <!-- end MetaColumn 2 --> </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>