Search | arXiv e-print repository
<a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1–50 of 113 results for author: <span class="mathjax">Xue, T</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a> </span> </div> </div> <div class="content"> <form method="GET" action="/search/eess" aria-role="search"> Searching in archive <strong>eess</strong>. <a href="/search/?searchtype=author&query=Xue%2C+T">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." 
type="text" value="Xue, T"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Xue%2C+T&terms-0-field=author&size=50&order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Xue, T"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. 
</div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Xue%2C+T&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Xue%2C+T&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Xue%2C+T&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Xue%2C+T&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.06939">arXiv:2502.06939</a> <span> [<a href="https://arxiv.org/pdf/2502.06939">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Generalizable automated ischaemic stroke lesion segmentation with vision transformers </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Foulon%2C+C">Chris Foulon</a>, <a href="/search/eess?searchtype=author&query=Gray%2C+R">Robert Gray</a>, <a href="/search/eess?searchtype=author&query=Ruffle%2C+J+K">James K. Ruffle</a>, <a href="/search/eess?searchtype=author&query=Best%2C+J">Jonathan Best</a>, <a href="/search/eess?searchtype=author&query=Xu%2C+T">Tianbo Xu</a>, <a href="/search/eess?searchtype=author&query=Watkins%2C+H">Henry Watkins</a>, <a href="/search/eess?searchtype=author&query=Rondina%2C+J">Jane Rondina</a>, <a href="/search/eess?searchtype=author&query=Pombo%2C+G">Guilherme Pombo</a>, <a href="/search/eess?searchtype=author&query=Giles%2C+D">Dominic Giles</a>, <a href="/search/eess?searchtype=author&query=Wright%2C+P">Paul Wright</a>, <a href="/search/eess?searchtype=author&query=Ovando-Tellez%2C+M">Marcela Ovando-Tellez</a>, <a href="/search/eess?searchtype=author&query=J%C3%A4ger%2C+H+R">H. 
Rolf J盲ger</a>, <a href="/search/eess?searchtype=author&query=Cardoso%2C+J">Jorge Cardoso</a>, <a href="/search/eess?searchtype=author&query=Ourselin%2C+S">Sebastien Ourselin</a>, <a href="/search/eess?searchtype=author&query=Rees%2C+G">Geraint Rees</a>, <a href="/search/eess?searchtype=author&query=Nachev%2C+P">Parashkev Nachev</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.06939v1-abstract-short" style="display: inline;"> Ischaemic stroke, a leading cause of death and disability, critically relies on neuroimaging for characterising the anatomical pattern of injury. Diffusion-weighted imaging (DWI) provides the highest expressivity in ischemic stroke but poses substantial challenges for automated lesion segmentation: susceptibility artefacts, morphological heterogeneity, age-related comorbidities, time-dependent sig… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.06939v1-abstract-full').style.display = 'inline'; document.getElementById('2502.06939v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.06939v1-abstract-full" style="display: none;"> Ischaemic stroke, a leading cause of death and disability, critically relies on neuroimaging for characterising the anatomical pattern of injury. Diffusion-weighted imaging (DWI) provides the highest expressivity in ischemic stroke but poses substantial challenges for automated lesion segmentation: susceptibility artefacts, morphological heterogeneity, age-related comorbidities, time-dependent signal dynamics, instrumental variability, and limited labelled data. Current U-Net-based models therefore underperform, a problem accentuated by inadequate evaluation metrics that focus on mean performance, neglecting anatomical, subpopulation, and acquisition-dependent variability. Here, we present a high-performance DWI lesion segmentation tool addressing these challenges through optimized vision transformer-based architectures, integration of 3563 annotated lesions from multi-site data, and algorithmic enhancements, achieving state-of-the-art results. We further propose a novel evaluative framework assessing model fidelity, equity (across demographics and lesion subtypes), anatomical precision, and robustness to instrumental variability, promoting clinical and research utility. This work advances stroke imaging by reconciling model expressivity with domain-specific challenges and redefining performance benchmarks to prioritize equity and generalizability, critical for personalized medicine and mechanistic research. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.06939v1-abstract-full').style.display = 'none'; document.getElementById('2502.06939v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">29 pages, 7 figures, 2 tables, 1 supplementary table, 2 supplementary figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.06171">arXiv:2502.06171</a> <span> [<a href="https://arxiv.org/pdf/2502.06171">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> A Data-Efficient Pan-Tumor Foundation Model for Oncology CT Interpretation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Lei%2C+W">Wenhui Lei</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+H">Hanyu Chen</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+Z">Zitian Zhang</a>, <a href="/search/eess?searchtype=author&query=Luo%2C+L">Luyang Luo</a>, <a href="/search/eess?searchtype=author&query=Xiao%2C+Q">Qiong Xiao</a>, <a href="/search/eess?searchtype=author&query=Gu%2C+Y">Yannian Gu</a>, <a href="/search/eess?searchtype=author&query=Gao%2C+P">Peng Gao</a>, <a href="/search/eess?searchtype=author&query=Jiang%2C+Y">Yankai Jiang</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+C">Ci Wang</a>, <a href="/search/eess?searchtype=author&query=Wu%2C+G">Guangtao Wu</a>, <a href="/search/eess?searchtype=author&query=Xu%2C+T">Tongjia Xu</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+Y">Yingjie Zhang</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+X">Xiaofan Zhang</a>, <a href="/search/eess?searchtype=author&query=Rajpurkar%2C+P">Pranav Rajpurkar</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+S">Shaoting Zhang</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+Z">Zhenning Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.06171v1-abstract-short" style="display: inline;"> Artificial intelligence-assisted imaging analysis has made substantial strides in tumor diagnosis and management. Here we present PASTA, a pan-tumor CT foundation model that achieves state-of-the-art performance on 45 of 46 representative oncology tasks -- including lesion segmentation, tumor detection in plain CT, tumor staging, survival prediction, structured report generation, and cross-modalit… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.06171v1-abstract-full').style.display = 'inline'; document.getElementById('2502.06171v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.06171v1-abstract-full" style="display: none;"> Artificial intelligence-assisted imaging analysis has made substantial strides in tumor diagnosis and management. 
Here we present PASTA, a pan-tumor CT foundation model that achieves state-of-the-art performance on 45 of 46 representative oncology tasks -- including lesion segmentation, tumor detection in plain CT, tumor staging, survival prediction, structured report generation, and cross-modality transfer learning -- significantly outperforming the second-best models on 35 tasks. This remarkable advancement is driven by our development of PASTA-Gen, an innovative synthetic tumor generation framework that produces a comprehensive dataset of 30,000 CT scans with pixel-level annotated lesions and paired structured reports, encompassing malignancies across ten organs and five benign lesion types. By leveraging this rich, high-quality synthetic data, we overcome a longstanding bottleneck in the development of CT foundation models -- specifically, the scarcity of publicly available, high-quality annotated datasets due to privacy constraints and the substantial labor required for scaling precise data annotation. Encouragingly, PASTA demonstrates exceptional data efficiency with promising practical value, markedly improving performance on various tasks with only a small amount of real-world data. The open release of both the synthetic dataset and PASTA foundation model effectively addresses the challenge of data scarcity, thereby advancing oncological research and clinical translation.
Submitted 10 February, 2025; originally announced February 2025.
Comments: 57 pages, 7 figures

3. arXiv:2501.18418 [pdf, other]  (eess.IV; cs.CV)
Task-based Regularization in Penalized Least-Squares for Binary Signal Detection Tasks in Medical Image Denoising
Authors: Wentao Chen, Tianming Xu, Weimin Zhou
Abstract: Image denoising algorithms have been extensively investigated for medical imaging. To perform image denoising, penalized least-squares (PLS) problems can be designed and solved, in which the penalty term encodes prior knowledge of the object being imaged.
Sparsity-promoting penalties, such as total variation (TV), have been a popular choice for regularizing image denoising problems. However, such hand-crafted penalties may not be able to preserve task-relevant information in measured image data and can lead to oversmoothed image appearances and patchy artifacts that degrade signal detectability. Supervised learning methods that employ convolutional neural networks (CNNs) have emerged as a popular approach to denoising medical images. However, studies have shown that CNNs trained with loss functions based on traditional image quality measures can lead to a loss of task-relevant information in images. Some previous works have investigated task-based loss functions that employ model observers for training the CNN denoising models. However, such training processes typically require a large number of noisy and ground-truth (noise-free or low-noise) image data pairs. In this work, we propose a task-based regularization strategy for use with PLS in medical image denoising. The proposed task-based regularization is associated with the likelihood of linear test statistics of noisy images for Gaussian noise models. The proposed method does not require ground-truth image data and solves an individual optimization problem for denoising each image. Computer-simulation studies are conducted that consider a multivariate-normally distributed (MVN) lumpy background and a binary texture background. It is demonstrated that the proposed regularization strategy can effectively improve signal detectability in denoised images.
Submitted 31 January, 2025; v1 submitted 30 January, 2025; originally announced January 2025.
Comments: SPIE Medical Imaging 2025
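
To make the penalized least-squares (PLS) formulation above concrete, here is a minimal sketch of the generic PLS-plus-TV baseline the abstract contrasts against: denoising a single image by minimizing a data-fidelity term plus a smoothed total-variation penalty with plain gradient descent. The function name, the Charbonnier smoothing, and the parameter values are illustrative assumptions; this is not the task-based regularizer proposed in the paper.

    import numpy as np

    def denoise_pls_tv(y, lam=0.15, eps=0.1, iters=300):
        """Minimize 0.5*||x - y||^2 + lam * TV_eps(x) by gradient descent.

        TV_eps is a smoothed (Charbonnier) total-variation penalty so the
        objective is differentiable; boundary handling is kept simple.
        """
        x = y.astype(float).copy()
        step = 1.0 / (1.0 + lam * 8.0 / eps)   # conservative step from a Lipschitz bound
        for _ in range(iters):
            # Forward differences (last row/column difference set to zero).
            dx = np.diff(x, axis=1, append=x[:, -1:])
            dy = np.diff(x, axis=0, append=x[-1:, :])
            mag = np.sqrt(dx**2 + dy**2 + eps**2)
            px, py = dx / mag, dy / mag
            # Divergence (adjoint of the forward difference) of the normalized gradient field.
            div = (px - np.roll(px, 1, axis=1)) + (py - np.roll(py, 1, axis=0))
            grad = (x - y) - lam * div         # gradient of the PLS objective
            x -= step * grad
        return x

    # Toy usage: denoise a noisy piecewise-constant image.
    rng = np.random.default_rng(0)
    clean = np.zeros((64, 64)); clean[16:48, 16:48] = 1.0
    noisy = clean + 0.2 * rng.standard_normal(clean.shape)
    denoised = denoise_pls_tv(noisy)

As the abstract notes, such hand-crafted penalties tend to oversmooth; the paper's contribution replaces this generic penalty with a task-based term tied to the likelihood of linear test statistics, which the sketch does not attempt to reproduce.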
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">SPIE Medical Imaging 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.15743">arXiv:2501.15743</a> <span> [<a href="https://arxiv.org/pdf/2501.15743">pdf</a>, <a href="https://arxiv.org/format/2501.15743">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Z-Stack Scanning can Improve AI Detection of Mitosis: A Case Study of Meningiomas </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Gu%2C+H">Hongyan Gu</a>, <a href="/search/eess?searchtype=author&query=Onstott%2C+E">Ellie Onstott</a>, <a href="/search/eess?searchtype=author&query=Yan%2C+W">Wenzhong Yan</a>, <a href="/search/eess?searchtype=author&query=Xu%2C+T">Tengyou Xu</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+R">Ruolin Wang</a>, <a href="/search/eess?searchtype=author&query=Wu%2C+Z">Zida Wu</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+X+%27">Xiang 'Anthony' Chen</a>, <a href="/search/eess?searchtype=author&query=Haeri%2C+M">Mohammad Haeri</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.15743v1-abstract-short" style="display: inline;"> Z-stack scanning is an emerging whole slide imaging technology that captures multiple focal planes alongside the z-axis of a glass slide. Because z-stacking can offer enhanced depth information compared to the single-layer whole slide imaging, this technology can be particularly useful in analyzing small-scaled histopathological patterns. However, its actual clinical impact remains debated with mi… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.15743v1-abstract-full').style.display = 'inline'; document.getElementById('2501.15743v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.15743v1-abstract-full" style="display: none;"> Z-stack scanning is an emerging whole slide imaging technology that captures multiple focal planes alongside the z-axis of a glass slide. Because z-stacking can offer enhanced depth information compared to the single-layer whole slide imaging, this technology can be particularly useful in analyzing small-scaled histopathological patterns. However, its actual clinical impact remains debated with mixed results. To clarify this, we investigate the effect of z-stack scanning on artificial intelligence (AI) mitosis detection of meningiomas. With the same set of 22 Hematoxylin and Eosin meningioma glass slides scanned by three different digital pathology scanners, we tested the performance of three AI pipelines on both single-layer and z-stacked whole slide images (WSIs). Results showed that in all scanner-AI combinations, z-stacked WSIs significantly increased AI's sensitivity (+17.14%) on the mitosis detection with only a marginal impact on precision. 
Our findings provide quantitative evidence that highlights z-stack scanning as a promising technique for AI mitosis detection, paving the way for more reliable AI-assisted pathology workflows, which can ultimately benefit patient management.
Submitted 26 January, 2025; originally announced January 2025.
Comments: To appear in the 2025 IEEE 22nd International Symposium on Biomedical Imaging (ISBI)

5. arXiv:2501.13306 [pdf, other]  (cs.SD; cs.CL; eess.AS)
OSUM: Advancing Open Speech Understanding Models with Limited Resources in Academia
Authors: Xuelong Geng, Kun Wei, Qijie Shao, Shuiyun Liu, Zhennan Lin, Zhixian Zhao, Guojian Li, Wenjie Tian, Peikun Chen, Yangze Li, Pengcheng Guo, Mingchen Shao, Shuiyuan Wang, Yuang Cao, Chengyou Wang, Tianyi Xu, Yuhang Dai, Xinfa Zhu, Yue Li, Li Zhang, Lei Xie
Abstract: Large Language Models (LLMs) have made significant progress in various downstream tasks, inspiring the development of Speech Understanding Language Models (SULMs) to enable comprehensive speech-based interactions. However, most advanced SULMs are developed by the industry, leveraging large-scale datasets and computational resources that are not readily available to the academic community. Moreover, the lack of transparency in training details creates additional barriers to further innovation. In this study, we present OSUM, an Open Speech Understanding Model designed to explore the potential of training SULMs under constrained academic resources. The OSUM model combines a Whisper encoder with a Qwen2 LLM and supports a wide range of speech tasks, including speech recognition (ASR), speech recognition with timestamps (SRWT), vocal event detection (VED), speech emotion recognition (SER), speaking style recognition (SSR), speaker gender classification (SGC), speaker age prediction (SAP), and speech-to-text chat (STTC). By employing an ASR+X training strategy, OSUM achieves efficient and stable multi-task training by simultaneously optimizing ASR alongside target tasks. Beyond delivering strong performance, OSUM emphasizes transparency by providing openly available data preparation and training methodologies, offering valuable insights and practical guidance for the academic community. By doing so, we aim to accelerate research and innovation in advanced SULM technologies.
Submitted 16 February, 2025; v1 submitted 22 January, 2025; originally announced January 2025.
Comments: OSUM Technical Report v2. The experimental results reported herein differ from those in v1 because new data were added and the model was trained for more steps.
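
As a rough illustration of the ASR+X idea described above (jointly optimizing ASR with a secondary task on a shared speech encoder), the schematic PyTorch sketch below uses stand-in modules and a hypothetical utterance-level task head; the module choices, dimensions, frame-level ASR targets, and equal loss weighting are assumptions for illustration, not the OSUM implementation (which pairs a Whisper encoder with a Qwen2 LLM).

    import torch
    import torch.nn as nn

    class ASRPlusX(nn.Module):
        """Schematic ASR+X model: one speech encoder feeds both an ASR head
        and a task head X (e.g., emotion or speaking-style recognition)."""
        def __init__(self, d_model=512, vocab=5000, n_task_classes=8):
            super().__init__()
            self.encoder = nn.GRU(80, d_model, batch_first=True)    # stand-in for a Whisper-style encoder
            self.asr_head = nn.Linear(d_model, vocab)                # per-frame token logits
            self.task_head = nn.Linear(d_model, n_task_classes)     # utterance-level task X

        def forward(self, feats):
            h, _ = self.encoder(feats)               # (B, T, d_model)
            return self.asr_head(h), self.task_head(h.mean(dim=1))

    model = ASRPlusX()
    ce = nn.CrossEntropyLoss()
    feats = torch.randn(4, 200, 80)                  # fbank-like features
    asr_targets = torch.randint(0, 5000, (4, 200))   # toy frame-level targets
    task_targets = torch.randint(0, 8, (4,))
    asr_logits, task_logits = model(feats)
    # ASR+X: optimize ASR jointly with the target task (equal weights, illustrative).
    loss = ce(asr_logits.reshape(-1, 5000), asr_targets.reshape(-1)) + ce(task_logits, task_targets)
    loss.backward()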

6. arXiv:2501.11755 [pdf]  (eess.IV; cs.CV)
A generalizable 3D framework and model for self-supervised learning in medical imaging
Authors: Tony Xu, Sepehr Hosseini, Chris Anderson, Anthony Rinaldi, Rahul G. Krishnan, Anne L. Martel, Maged Goubran
Abstract: Current self-supervised learning methods for 3D medical imaging rely on simple pretext formulations and organ- or modality-specific datasets, limiting their generalizability and scalability. We present 3DINO, a cutting-edge SSL method adapted to 3D datasets, and use it to pretrain 3DINO-ViT: a general-purpose medical imaging model, on an exceptionally large, multimodal, and multi-organ dataset of ~100,000 3D medical imaging scans from over 10 organs. We validate 3DINO-ViT using extensive experiments on numerous medical imaging segmentation and classification tasks. Our results demonstrate that 3DINO-ViT generalizes across modalities and organs, including out-of-distribution tasks and datasets, outperforming state-of-the-art methods on the majority of evaluation metrics and labeled dataset sizes. Our 3DINO framework and 3DINO-ViT will be made available to enable research on 3D foundation models or further finetuning for a wide range of medical imaging applications.
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.11755v1-abstract-full').style.display = 'none'; document.getElementById('2501.11755v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.03880">arXiv:2501.03880</a> <span> [<a href="https://arxiv.org/pdf/2501.03880">pdf</a>, <a href="https://arxiv.org/format/2501.03880">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> SELMA3D challenge: Self-supervised learning for 3D light-sheet microscopy image segmentation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Chen%2C+Y">Ying Chen</a>, <a href="/search/eess?searchtype=author&query=Al-Maskari%2C+R">Rami Al-Maskari</a>, <a href="/search/eess?searchtype=author&query=Horvath%2C+I">Izabela Horvath</a>, <a href="/search/eess?searchtype=author&query=Ali%2C+M">Mayar Ali</a>, <a href="/search/eess?searchtype=author&query=Hoher%2C+L">Luciano Hoher</a>, <a href="/search/eess?searchtype=author&query=Yang%2C+K">Kaiyuan Yang</a>, <a href="/search/eess?searchtype=author&query=Lin%2C+Z">Zengming Lin</a>, <a href="/search/eess?searchtype=author&query=Zhai%2C+Z">Zhiwei Zhai</a>, <a href="/search/eess?searchtype=author&query=Shen%2C+M">Mengzhe Shen</a>, <a href="/search/eess?searchtype=author&query=Xun%2C+D">Dejin Xun</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+Y">Yi Wang</a>, <a href="/search/eess?searchtype=author&query=Xu%2C+T">Tony Xu</a>, <a href="/search/eess?searchtype=author&query=Goubran%2C+M">Maged Goubran</a>, <a href="/search/eess?searchtype=author&query=Wu%2C+Y">Yunheng Wu</a>, <a href="/search/eess?searchtype=author&query=Mori%2C+K">Kensaku Mori</a>, <a href="/search/eess?searchtype=author&query=Paetzold%2C+J+C">Johannes C. Paetzold</a>, <a href="/search/eess?searchtype=author&query=Erturk%2C+A">Ali Erturk</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.03880v2-abstract-short" style="display: inline;"> Recent innovations in light sheet microscopy, paired with developments in tissue clearing techniques, enable the 3D imaging of large mammalian tissues with cellular resolution. Combined with the progress in large-scale data analysis, driven by deep learning, these innovations empower researchers to rapidly investigate the morphological and functional properties of diverse biological samples. 
Abstract: Recent innovations in light sheet microscopy, paired with developments in tissue clearing techniques, enable the 3D imaging of large mammalian tissues with cellular resolution. Combined with the progress in large-scale data analysis, driven by deep learning, these innovations empower researchers to rapidly investigate the morphological and functional properties of diverse biological samples. Segmentation, a crucial preliminary step in the analysis process, can be automated using domain-specific deep learning models with expert-level performance. However, these models exhibit high sensitivity to domain shifts, leading to a significant drop in accuracy when applied to data outside their training distribution. To address this limitation, and inspired by the recent success of self-supervised learning in training generalizable models, we organized the SELMA3D Challenge during the MICCAI 2024 conference. SELMA3D provides a vast collection of light-sheet images from cleared mice and human brains, comprising 35 large 3D images, each with over 1000^3 voxels, and 315 annotated small patches for finetuning, preliminary testing and final testing. The dataset encompasses diverse biological structures, including vessel-like and spot-like structures. Five teams participated in all phases of the challenge, and their proposed methods are reviewed in this paper. Quantitative and qualitative results from most participating teams demonstrate that self-supervised learning on large datasets improves segmentation model performance and generalization. We will continue to support and extend SELMA3D as an inaugural MICCAI challenge focused on self-supervised learning for 3D microscopy image segmentation.
Submitted 12 January, 2025; v1 submitted 7 January, 2025; originally announced January 2025.
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">2st version</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.10822">arXiv:2412.10822</a> <span> [<a href="https://arxiv.org/pdf/2412.10822">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> Automated Driving with Evolution Capability: A Reinforcement Learning Method with Monotonic Performance Enhancement </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Hu%2C+J">Jia Hu</a>, <a href="/search/eess?searchtype=author&query=Yan%2C+X">Xuerun Yan</a>, <a href="/search/eess?searchtype=author&query=Xu%2C+T">Tian Xu</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+H">Haoran Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.10822v1-abstract-short" style="display: inline;"> Reinforcement Learning (RL) offers a promising solution to enable evolutionary automated driving. However, the conventional RL method is always concerned with risk performance. The updated policy may not obtain a performance enhancement, even leading to performance deterioration. To address this challenge, this research proposes a High Confidence Policy Improvement Reinforcement Learning-based (HC… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.10822v1-abstract-full').style.display = 'inline'; document.getElementById('2412.10822v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.10822v1-abstract-full" style="display: none;"> Reinforcement Learning (RL) offers a promising solution to enable evolutionary automated driving. However, the conventional RL method is always concerned with risk performance. The updated policy may not obtain a performance enhancement, even leading to performance deterioration. To address this challenge, this research proposes a High Confidence Policy Improvement Reinforcement Learning-based (HCPI-RL) planner. It is intended to achieve the monotonic evolution of automated driving. A novel RL policy update paradigm is designed to enable the newly learned policy performance consistently surpass that of previous policies, which is deemed as monotonic performance enhancement. Hence, the proposed HCPI-RL planner has the following features: i) Evolutionary automated driving with monotonic performance enhancement; ii) With the capability of handling scenarios with emergency; iii) With enhanced decision-making optimality. Results demonstrate that the proposed HCPI-RL planner enhances the policy return by 44.7% in emergent cut-in scenarios, 108.2% in emergent braking scenarios, and 64.4% in daily cruising scenarios, compared to the PPO planner. Adopting the proposed planner, automated driving efficiency is enhanced by 19.2% compared to the PPO planner, and by 30.7% compared to the rule-based planner. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.10822v1-abstract-full').style.display = 'none'; document.getElementById('2412.10822v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">24 pages, 16figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.09856">arXiv:2412.09856</a> <span> [<a href="https://arxiv.org/pdf/2412.09856">pdf</a>, <a href="https://arxiv.org/format/2412.09856">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> LinGen: Towards High-Resolution Minute-Length Text-to-Video Generation with Linear Computational Complexity </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Wang%2C+H">Hongjie Wang</a>, <a href="/search/eess?searchtype=author&query=Ma%2C+C">Chih-Yao Ma</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+Y">Yen-Cheng Liu</a>, <a href="/search/eess?searchtype=author&query=Hou%2C+J">Ji Hou</a>, <a href="/search/eess?searchtype=author&query=Xu%2C+T">Tao Xu</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+J">Jialiang Wang</a>, <a href="/search/eess?searchtype=author&query=Juefei-Xu%2C+F">Felix Juefei-Xu</a>, <a href="/search/eess?searchtype=author&query=Luo%2C+Y">Yaqiao Luo</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+P">Peizhao Zhang</a>, <a href="/search/eess?searchtype=author&query=Hou%2C+T">Tingbo Hou</a>, <a href="/search/eess?searchtype=author&query=Vajda%2C+P">Peter Vajda</a>, <a href="/search/eess?searchtype=author&query=Jha%2C+N+K">Niraj K. Jha</a>, <a href="/search/eess?searchtype=author&query=Dai%2C+X">Xiaoliang Dai</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.09856v1-abstract-short" style="display: inline;"> Text-to-video generation enhances content creation but is highly computationally intensive: The computational cost of Diffusion Transformers (DiTs) scales quadratically in the number of pixels. This makes minute-length video generation extremely expensive, limiting most existing models to generating videos of only 10-20 seconds length. 
We propose a Linear-complexity text-to-video Generation (LinGen) framework whose cost scales linearly in the number of pixels. For the first time, LinGen enables high-resolution minute-length video generation on a single GPU without compromising quality. It replaces the computationally-dominant and quadratic-complexity block, self-attention, with a linear-complexity block called MATE, which consists of an MA-branch and a TE-branch. The MA-branch targets short-to-long-range correlations, combining a bidirectional Mamba2 block with our token rearrangement method, Rotary Major Scan, and our review tokens developed for long video generation. The TE-branch is a novel TEmporal Swin Attention block that focuses on temporal correlations between adjacent tokens and medium-range tokens. The MATE block addresses the adjacency preservation issue of Mamba and improves the consistency of generated videos significantly. Experimental results show that LinGen outperforms DiT (with a 75.6% win rate) in video quality with up to 15x (11.5x) FLOPs (latency) reduction. Furthermore, both automatic metrics and human evaluation demonstrate our LinGen-4B yields comparable video quality to state-of-the-art models (with a 50.5%, 52.1%, 49.1% win rate with respect to Gen-3, LumaLabs, and Kling, respectively). This paves the way to hour-length movie generation and real-time interactive video generation. We provide 68s video generation results and more examples in our project website: https://lineargen.github.io/.
Submitted 12 December, 2024; originally announced December 2024.
Comments: 20 pages, 20 figures
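
To see why the quadratic-versus-linear distinction above matters for minute-length video, here is a back-of-the-envelope comparison of how token-mixing cost grows with clip length; the latent frame rate and patch grid are assumed values, not LinGen's configuration.

    def tokens(seconds, fps=16, height=32, width=32):
        """Number of latent tokens for a clip (assumed patch grid and frame rate)."""
        return seconds * fps * height * width

    n10, n60 = tokens(10), tokens(60)
    print("token growth from 10 s to 60 s:  ", n60 / n10)         # 6x more tokens
    print("linear-complexity mixer cost:    ", n60 / n10)         # grows ~6x
    print("quadratic self-attention cost:   ", (n60 / n10) ** 2)  # grows ~36x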
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">20 pages, 20 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.01425">arXiv:2412.01425</a> <span> [<a href="https://arxiv.org/pdf/2412.01425">pdf</a>, <a href="https://arxiv.org/format/2412.01425">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Reject Threshold Adaptation for Open-Set Model Attribution of Deepfake Audio </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Yan%2C+X">Xinrui Yan</a>, <a href="/search/eess?searchtype=author&query=Yi%2C+J">Jiangyan Yi</a>, <a href="/search/eess?searchtype=author&query=Tao%2C+J">Jianhua Tao</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+Y">Yujie Chen</a>, <a href="/search/eess?searchtype=author&query=Gu%2C+H">Hao Gu</a>, <a href="/search/eess?searchtype=author&query=Li%2C+G">Guanjun Li</a>, <a href="/search/eess?searchtype=author&query=Zhou%2C+J">Junzuo Zhou</a>, <a href="/search/eess?searchtype=author&query=Ren%2C+Y">Yong Ren</a>, <a href="/search/eess?searchtype=author&query=Xu%2C+T">Tao Xu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.01425v1-abstract-short" style="display: inline;"> Open environment oriented open set model attribution of deepfake audio is an emerging research topic, aiming to identify the generation models of deepfake audio. Most previous work requires manually setting a rejection threshold for unknown classes to compare with predicted probabilities. However, models often overfit training instances and generate overly confident predictions. Moreover, threshol… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.01425v1-abstract-full').style.display = 'inline'; document.getElementById('2412.01425v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.01425v1-abstract-full" style="display: none;"> Open environment oriented open set model attribution of deepfake audio is an emerging research topic, aiming to identify the generation models of deepfake audio. Most previous work requires manually setting a rejection threshold for unknown classes to compare with predicted probabilities. However, models often overfit training instances and generate overly confident predictions. Moreover, thresholds that effectively distinguish unknown categories in the current dataset may not be suitable for identifying known and unknown categories in another data distribution. To address the issues, we propose a novel framework for open set model attribution of deepfake audio with rejection threshold adaptation (ReTA). Specifically, the reconstruction error learning module trains by combining the representation of system fingerprints with labels corresponding to either the target class or a randomly chosen other class label. 
This process generates matching and non-matching reconstructed samples, establishing the reconstruction error distributions for each class and laying the foundation for the reject threshold calculation module. The reject threshold calculation module utilizes Gaussian probability estimation to fit the distributions of matching and non-matching reconstruction errors. It then computes adaptive reject thresholds for all classes through probability minimization criteria. The experimental results demonstrate the effectiveness of ReTA in improving the open set model attribution of deepfake audio. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.01425v1-abstract-full').style.display = 'none'; document.getElementById('2412.01425v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by ISCSLP 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.12653">arXiv:2411.12653</a> <span> [<a href="https://arxiv.org/pdf/2411.12653">pdf</a>, <a href="https://arxiv.org/ps/2411.12653">ps</a>, <a href="https://arxiv.org/format/2411.12653">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Smart Predict-then-Optimize Method with Dependent Data: Risk Bounds and Calibration of Autoregression </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Liu%2C+J">Jixian Liu</a>, <a href="/search/eess?searchtype=author&query=Xu%2C+T">Tao Xu</a>, <a href="/search/eess?searchtype=author&query=He%2C+J">Jianping He</a>, <a href="/search/eess?searchtype=author&query=Fang%2C+C">Chongrong Fang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.12653v1-abstract-short" style="display: inline;"> The predict-then-optimize (PTO) framework is indispensable for addressing practical stochastic decision-making tasks. It consists of two crucial steps: initially predicting unknown parameters of an optimization model and subsequently solving the problem based on these predictions. Elmachtoub and Grigas [1] introduced the Smart Predict-then-Optimize (SPO) loss for the framework, which gauges the de… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12653v1-abstract-full').style.display = 'inline'; document.getElementById('2411.12653v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.12653v1-abstract-full" style="display: none;"> The predict-then-optimize (PTO) framework is indispensable for addressing practical stochastic decision-making tasks.
It consists of two crucial steps: initially predicting unknown parameters of an optimization model and subsequently solving the problem based on these predictions. Elmachtoub and Grigas [1] introduced the Smart Predict-then-Optimize (SPO) loss for the framework, which gauges the decision error arising from predicted parameters, and a convex surrogate, the SPO+ loss, which incorporates the underlying structure of the optimization model. The consistency of these different loss functions is guaranteed under the assumption of i.i.d. training data. Nevertheless, various types of data are often dependent, such as power load fluctuations over time. This dependent nature can lead to diminished model performance in testing or real-world applications. Motivated to make intelligent predictions for time series data, we present an autoregressive SPO method directly targeting the optimization problem at the decision stage in this paper, where the conditions of consistency are no longer met. Therefore, we first analyze the generalization bounds of the SPO loss within our autoregressive model. Subsequently, the uniform calibration results in Liu and Grigas [2] are extended in the proposed model. Finally, we conduct experiments to empirically demonstrate the effectiveness of the SPO+ surrogate compared to the absolute loss and the least squares loss, especially when the cost vectors are determined by stationary dynamical systems and demonstrate the relationship between normalized regret and mixing coefficients. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12653v1-abstract-full').style.display = 'none'; document.getElementById('2411.12653v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">10 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.21276">arXiv:2410.21276</a> <span> [<a href="https://arxiv.org/pdf/2410.21276">pdf</a>, <a href="https://arxiv.org/format/2410.21276">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computers and Society">cs.CY</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> GPT-4o System Card </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=OpenAI"> OpenAI</a>, <a href="/search/eess?searchtype=author&query=%3A"> :</a>, <a href="/search/eess?searchtype=author&query=Hurst%2C+A">Aaron Hurst</a>, <a href="/search/eess?searchtype=author&query=Lerer%2C+A">Adam Lerer</a>, <a href="/search/eess?searchtype=author&query=Goucher%2C+A+P">Adam P. Goucher</a>, <a href="/search/eess?searchtype=author&query=Perelman%2C+A">Adam Perelman</a>, <a href="/search/eess?searchtype=author&query=Ramesh%2C+A">Aditya Ramesh</a>, <a href="/search/eess?searchtype=author&query=Clark%2C+A">Aidan Clark</a>, <a href="/search/eess?searchtype=author&query=Ostrow%2C+A">AJ Ostrow</a>, <a href="/search/eess?searchtype=author&query=Welihinda%2C+A">Akila Welihinda</a>, <a href="/search/eess?searchtype=author&query=Hayes%2C+A">Alan Hayes</a>, <a href="/search/eess?searchtype=author&query=Radford%2C+A">Alec Radford</a>, <a href="/search/eess?searchtype=author&query=M%C4%85dry%2C+A">Aleksander Mądry</a>, <a href="/search/eess?searchtype=author&query=Baker-Whitcomb%2C+A">Alex Baker-Whitcomb</a>, <a href="/search/eess?searchtype=author&query=Beutel%2C+A">Alex Beutel</a>, <a href="/search/eess?searchtype=author&query=Borzunov%2C+A">Alex Borzunov</a>, <a href="/search/eess?searchtype=author&query=Carney%2C+A">Alex Carney</a>, <a href="/search/eess?searchtype=author&query=Chow%2C+A">Alex Chow</a>, <a href="/search/eess?searchtype=author&query=Kirillov%2C+A">Alex Kirillov</a>, <a href="/search/eess?searchtype=author&query=Nichol%2C+A">Alex Nichol</a>, <a href="/search/eess?searchtype=author&query=Paino%2C+A">Alex Paino</a>, <a href="/search/eess?searchtype=author&query=Renzin%2C+A">Alex Renzin</a>, <a href="/search/eess?searchtype=author&query=Passos%2C+A+T">Alex Tachard Passos</a>, <a href="/search/eess?searchtype=author&query=Kirillov%2C+A">Alexander Kirillov</a>, <a href="/search/eess?searchtype=author&query=Christakis%2C+A">Alexi Christakis</a> , et al.
(395 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.21276v1-abstract-short" style="display: inline;"> GPT-4o is an autoregressive omni model that accepts as input any combination of text, audio, image, and video, and generates any combination of text, audio, and image outputs. It's trained end-to-end across text, vision, and audio, meaning all inputs and outputs are processed by the same neural network. GPT-4o can respond to audio inputs in as little as 232 milliseconds, with an average of 320 mil… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.21276v1-abstract-full').style.display = 'inline'; document.getElementById('2410.21276v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.21276v1-abstract-full" style="display: none;"> GPT-4o is an autoregressive omni model that accepts as input any combination of text, audio, image, and video, and generates any combination of text, audio, and image outputs. It's trained end-to-end across text, vision, and audio, meaning all inputs and outputs are processed by the same neural network. GPT-4o can respond to audio inputs in as little as 232 milliseconds, with an average of 320 milliseconds, which is similar to human response time in conversation. It matches GPT-4 Turbo performance on text in English and code, with significant improvement on text in non-English languages, while also being much faster and 50\% cheaper in the API. GPT-4o is especially better at vision and audio understanding compared to existing models. In line with our commitment to building AI safely and consistent with our voluntary commitments to the White House, we are sharing the GPT-4o System Card, which includes our Preparedness Framework evaluations. In this System Card, we provide a detailed look at GPT-4o's capabilities, limitations, and safety evaluations across multiple categories, focusing on speech-to-speech while also evaluating text and image capabilities, and measures we've implemented to ensure the model is safe and aligned. We also include third-party assessments on dangerous capabilities, as well as discussion of potential societal impacts of GPT-4o's text and vision capabilities. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.21276v1-abstract-full').style.display = 'none'; document.getElementById('2410.21276v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.13720">arXiv:2410.13720</a> <span> [<a href="https://arxiv.org/pdf/2410.13720">pdf</a>, <a href="https://arxiv.org/format/2410.13720">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> Movie Gen: A Cast of Media Foundation Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Polyak%2C+A">Adam Polyak</a>, <a href="/search/eess?searchtype=author&query=Zohar%2C+A">Amit Zohar</a>, <a href="/search/eess?searchtype=author&query=Brown%2C+A">Andrew Brown</a>, <a href="/search/eess?searchtype=author&query=Tjandra%2C+A">Andros Tjandra</a>, <a href="/search/eess?searchtype=author&query=Sinha%2C+A">Animesh Sinha</a>, <a href="/search/eess?searchtype=author&query=Lee%2C+A">Ann Lee</a>, <a href="/search/eess?searchtype=author&query=Vyas%2C+A">Apoorv Vyas</a>, <a href="/search/eess?searchtype=author&query=Shi%2C+B">Bowen Shi</a>, <a href="/search/eess?searchtype=author&query=Ma%2C+C">Chih-Yao Ma</a>, <a href="/search/eess?searchtype=author&query=Chuang%2C+C">Ching-Yao Chuang</a>, <a href="/search/eess?searchtype=author&query=Yan%2C+D">David Yan</a>, <a href="/search/eess?searchtype=author&query=Choudhary%2C+D">Dhruv Choudhary</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+D">Dingkang Wang</a>, <a href="/search/eess?searchtype=author&query=Sethi%2C+G">Geet Sethi</a>, <a href="/search/eess?searchtype=author&query=Pang%2C+G">Guan Pang</a>, <a href="/search/eess?searchtype=author&query=Ma%2C+H">Haoyu Ma</a>, <a href="/search/eess?searchtype=author&query=Misra%2C+I">Ishan Misra</a>, <a href="/search/eess?searchtype=author&query=Hou%2C+J">Ji Hou</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+J">Jialiang Wang</a>, <a href="/search/eess?searchtype=author&query=Jagadeesh%2C+K">Kiran Jagadeesh</a>, <a href="/search/eess?searchtype=author&query=Li%2C+K">Kunpeng Li</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+L">Luxin Zhang</a>, <a href="/search/eess?searchtype=author&query=Singh%2C+M">Mannat Singh</a>, <a href="/search/eess?searchtype=author&query=Williamson%2C+M">Mary Williamson</a>, <a href="/search/eess?searchtype=author&query=Le%2C+M">Matt Le</a> , et al. (63 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.13720v1-abstract-short" style="display: inline;"> We present Movie Gen, a cast of foundation models that generates high-quality, 1080p HD videos with different aspect ratios and synchronized audio. We also show additional capabilities such as precise instruction-based video editing and generation of personalized videos based on a user's image. 
Our models set a new state-of-the-art on multiple tasks: text-to-video synthesis, video personalization,… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.13720v1-abstract-full').style.display = 'inline'; document.getElementById('2410.13720v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.13720v1-abstract-full" style="display: none;"> We present Movie Gen, a cast of foundation models that generates high-quality, 1080p HD videos with different aspect ratios and synchronized audio. We also show additional capabilities such as precise instruction-based video editing and generation of personalized videos based on a user's image. Our models set a new state-of-the-art on multiple tasks: text-to-video synthesis, video personalization, video editing, video-to-audio generation, and text-to-audio generation. Our largest video generation model is a 30B parameter transformer trained with a maximum context length of 73K video tokens, corresponding to a generated video of 16 seconds at 16 frames-per-second. We show multiple technical innovations and simplifications on the architecture, latent spaces, training objectives and recipes, data curation, evaluation protocols, parallelization techniques, and inference optimizations that allow us to reap the benefits of scaling pre-training data, model size, and training compute for training large scale media generation models. We hope this paper helps the research community to accelerate progress and innovation in media generation models. All videos from this paper are available at https://go.fb.me/MovieGenResearchVideos. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.13720v1-abstract-full').style.display = 'none'; document.getElementById('2410.13720v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.18783">arXiv:2409.18783</a> <span> [<a href="https://arxiv.org/pdf/2409.18783">pdf</a>, <a href="https://arxiv.org/format/2409.18783">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> DualDn: Dual-domain Denoising via Differentiable ISP </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Li%2C+R">Ruikang Li</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+Y">Yujin Wang</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+S">Shiqi Chen</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+F">Fan Zhang</a>, <a href="/search/eess?searchtype=author&query=Gu%2C+J">Jinwei Gu</a>, <a href="/search/eess?searchtype=author&query=Xue%2C+T">Tianfan Xue</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.18783v2-abstract-short" style="display: inline;"> Image denoising is a critical component in a camera's Image Signal Processing (ISP) pipeline. There are two typical ways to inject a denoiser into the ISP pipeline: applying a denoiser directly to captured raw frames (raw domain) or to the ISP's output sRGB images (sRGB domain). However, both approaches have their limitations. Residual noise from raw-domain denoising can be amplified by the subseq… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.18783v2-abstract-full').style.display = 'inline'; document.getElementById('2409.18783v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.18783v2-abstract-full" style="display: none;"> Image denoising is a critical component in a camera's Image Signal Processing (ISP) pipeline. There are two typical ways to inject a denoiser into the ISP pipeline: applying a denoiser directly to captured raw frames (raw domain) or to the ISP's output sRGB images (sRGB domain). However, both approaches have their limitations. Residual noise from raw-domain denoising can be amplified by the subsequent ISP processing, and the sRGB domain struggles to handle spatially varying noise since it only sees noise distorted by the ISP. Consequently, most raw or sRGB domain denoising works only for specific noise distributions and ISP configurations. To address these challenges, we propose DualDn, a novel learning-based dual-domain denoising. Unlike previous single-domain denoising, DualDn consists of two denoising networks: one in the raw domain and one in the sRGB domain. The raw domain denoising adapts to sensor-specific noise as well as spatially varying noise levels, while the sRGB domain denoising adapts to ISP variations and removes residual noise amplified by the ISP. Both denoising networks are connected with a differentiable ISP, which is trained end-to-end and discarded during the inference stage. 
With this design, DualDn achieves greater generalizability compared to most learning-based denoising methods, as it can adapt to different unseen noises, ISP parameters, and even novel ISP pipelines. Experiments show that DualDn achieves state-of-the-art performance and can adapt to different denoising architectures. Moreover, DualDn can be used as a plug-and-play denoising module with real cameras without retraining, and still demonstrate better performance than commercial on-camera denoising. The project website is available at: https://openimaginglab.github.io/DualDn/ <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.18783v2-abstract-full').style.display = 'none'; document.getElementById('2409.18783v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 27 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted at ECCV 2024, Project page: https://openimaginglab.github.io/DualDn/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.17996">arXiv:2409.17996</a> <span> [<a href="https://arxiv.org/pdf/2409.17996">pdf</a>, <a href="https://arxiv.org/format/2409.17996">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> PhoCoLens: Photorealistic and Consistent Reconstruction in Lensless Imaging </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Cai%2C+X">Xin Cai</a>, <a href="/search/eess?searchtype=author&query=You%2C+Z">Zhiyuan You</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+H">Hailong Zhang</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+W">Wentao Liu</a>, <a href="/search/eess?searchtype=author&query=Gu%2C+J">Jinwei Gu</a>, <a href="/search/eess?searchtype=author&query=Xue%2C+T">Tianfan Xue</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.17996v2-abstract-short" style="display: inline;"> Lensless cameras offer significant advantages in size, weight, and cost compared to traditional lens-based systems. Without a focusing lens, lensless cameras rely on computational algorithms to recover the scenes from multiplexed measurements. However, current algorithms struggle with inaccurate forward imaging models and insufficient priors to reconstruct high-quality images. 
To overcome these li… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.17996v2-abstract-full').style.display = 'inline'; document.getElementById('2409.17996v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.17996v2-abstract-full" style="display: none;"> Lensless cameras offer significant advantages in size, weight, and cost compared to traditional lens-based systems. Without a focusing lens, lensless cameras rely on computational algorithms to recover the scenes from multiplexed measurements. However, current algorithms struggle with inaccurate forward imaging models and insufficient priors to reconstruct high-quality images. To overcome these limitations, we introduce a novel two-stage approach for consistent and photorealistic lensless image reconstruction. The first stage of our approach ensures data consistency by focusing on accurately reconstructing the low-frequency content with a spatially varying deconvolution method that adjusts to changes in the Point Spread Function (PSF) across the camera's field of view. The second stage enhances photorealism by incorporating a generative prior from pre-trained diffusion models. By conditioning on the low-frequency content retrieved in the first stage, the diffusion model effectively reconstructs the high-frequency details that are typically lost in the lensless imaging process, while also maintaining image fidelity. Our method achieves a superior balance between data fidelity and visual quality compared to existing methods, as demonstrated with two popular lensless systems, PhlatCam and DiffuserCam. Project website: https://phocolens.github.io/. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.17996v2-abstract-full').style.display = 'none'; document.getElementById('2409.17996v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 26 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">NeurIPS 2024 Spotlight</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.03005">arXiv:2409.03005</a> <span> [<a href="https://arxiv.org/pdf/2409.03005">pdf</a>, <a href="https://arxiv.org/format/2409.03005">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> PIETRA: Physics-Informed Evidential Learning for Traversing Out-of-Distribution Terrain </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Cai%2C+X">Xiaoyi Cai</a>, <a href="/search/eess?searchtype=author&query=Queeney%2C+J">James Queeney</a>, <a href="/search/eess?searchtype=author&query=Xu%2C+T">Tong Xu</a>, <a href="/search/eess?searchtype=author&query=Datar%2C+A">Aniket Datar</a>, <a href="/search/eess?searchtype=author&query=Pan%2C+C">Chenhui Pan</a>, <a href="/search/eess?searchtype=author&query=Miller%2C+M">Max Miller</a>, <a href="/search/eess?searchtype=author&query=Flather%2C+A">Ashton Flather</a>, <a href="/search/eess?searchtype=author&query=Osteen%2C+P+R">Philip R. Osteen</a>, <a href="/search/eess?searchtype=author&query=Roy%2C+N">Nicholas Roy</a>, <a href="/search/eess?searchtype=author&query=Xiao%2C+X">Xuesu Xiao</a>, <a href="/search/eess?searchtype=author&query=How%2C+J+P">Jonathan P. How</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.03005v2-abstract-short" style="display: inline;"> Self-supervised learning is a powerful approach for developing traversability models for off-road navigation, but these models often struggle with inputs unseen during training. Existing methods utilize techniques like evidential deep learning to quantify model uncertainty, helping to identify and avoid out-of-distribution terrain. However, always avoiding out-of-distribution terrain can be overly… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.03005v2-abstract-full').style.display = 'inline'; document.getElementById('2409.03005v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.03005v2-abstract-full" style="display: none;"> Self-supervised learning is a powerful approach for developing traversability models for off-road navigation, but these models often struggle with inputs unseen during training. Existing methods utilize techniques like evidential deep learning to quantify model uncertainty, helping to identify and avoid out-of-distribution terrain. However, always avoiding out-of-distribution terrain can be overly conservative, e.g., when novel terrain can be effectively analyzed using a physics-based model. 
To overcome this challenge, we introduce Physics-Informed Evidential Traversability (PIETRA), a self-supervised learning framework that integrates physics priors directly into the mathematical formulation of evidential neural networks and introduces physics knowledge implicitly through an uncertainty-aware, physics-informed training loss. Our evidential network seamlessly transitions between learned and physics-based predictions for out-of-distribution inputs. Additionally, the physics-informed loss regularizes the learned model, ensuring better alignment with the physics model. Extensive simulations and hardware experiments demonstrate that PIETRA improves both learning accuracy and navigation performance in environments with significant distribution shifts. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.03005v2-abstract-full').style.display = 'none'; document.getElementById('2409.03005v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 4 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">To appear in RA-L. Video: https://youtu.be/OTnNZ96oJRk</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.10680">arXiv:2408.10680</a> <span> [<a href="https://arxiv.org/pdf/2408.10680">pdf</a>, <a href="https://arxiv.org/format/2408.10680">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Towards Rehearsal-Free Multilingual ASR: A LoRA-based Case Study on Whisper </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Xu%2C+T">Tianyi Xu</a>, <a href="/search/eess?searchtype=author&query=Huang%2C+K">Kaixun Huang</a>, <a href="/search/eess?searchtype=author&query=Guo%2C+P">Pengcheng Guo</a>, <a href="/search/eess?searchtype=author&query=Zhou%2C+Y">Yu Zhou</a>, <a href="/search/eess?searchtype=author&query=Huang%2C+L">Longtao Huang</a>, <a href="/search/eess?searchtype=author&query=Xue%2C+H">Hui Xue</a>, <a href="/search/eess?searchtype=author&query=Xie%2C+L">Lei Xie</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.10680v1-abstract-short" style="display: inline;"> Pre-trained multilingual speech foundation models, like Whisper, have shown impressive performance across different languages. However, adapting these models to new or specific languages is computationally extensive and faces catastrophic forgetting problems. 
Addressing these issues, our study investigates strategies to enhance the model on new languages in the absence of original training data, w… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.10680v1-abstract-full').style.display = 'inline'; document.getElementById('2408.10680v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.10680v1-abstract-full" style="display: none;"> Pre-trained multilingual speech foundation models, like Whisper, have shown impressive performance across different languages. However, adapting these models to new or specific languages is computationally extensive and faces catastrophic forgetting problems. Addressing these issues, our study investigates strategies to enhance the model on new languages in the absence of original training data, while also preserving the established performance on the original languages. Specifically, we first compare various LoRA-based methods to find out their vulnerability to forgetting. To mitigate this issue, we propose to leverage the LoRA parameters from the original model for approximate orthogonal gradient descent on the new samples. Additionally, we also introduce a learnable rank coefficient to allocate trainable parameters for more efficient training. Our experiments with a Chinese Whisper model (for Uyghur and Tibetan) yield better results with a more compact parameter set. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.10680v1-abstract-full').style.display = 'none'; document.getElementById('2408.10680v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.06776">arXiv:2408.06776</a> <span> [<a href="https://arxiv.org/pdf/2408.06776">pdf</a>, <a href="https://arxiv.org/format/2408.06776">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Robust Deep Reinforcement Learning for Inverter-based Volt-Var Control in Partially Observable Distribution Networks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Liu%2C+Q">Qiong Liu</a>, <a href="/search/eess?searchtype=author&query=Guo%2C+Y">Ye Guo</a>, <a href="/search/eess?searchtype=author&query=Xu%2C+T">Tong Xu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.06776v1-abstract-short" style="display: inline;"> Inverter-based volt-var control is studied in this paper. One key issue in DRL-based approaches is the limited measurement deployment in active distribution networks, which leads to problems of a partially observable state and unknown reward. To address those problems, this paper proposes a robust DRL approach with a conservative critic and a surrogate reward. 
The conservative critic utilizes the… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.06776v1-abstract-full').style.display = 'inline'; document.getElementById('2408.06776v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.06776v1-abstract-full" style="display: none;"> Inverter-based volt-var control is studied in this paper. One key issue in DRL-based approaches is the limited measurement deployment in active distribution networks, which leads to problems of a partially observable state and unknown reward. To address those problems, this paper proposes a robust DRL approach with a conservative critic and a surrogate reward. The conservative critic utilizes the quantile regression technology to estimate conservative state-action value function based on the partially observable state, which helps to train a robust policy; the surrogate rewards of power loss and voltage violation are designed that can be calculated from the limited measurements. The proposed approach optimizes the power loss of the whole network and the voltage profile of buses with measurable voltages while indirectly improving the voltage profile of other buses. Extensive simulations verify the effectiveness of the robust DRL approach in different limited measurement conditions, even when only the active power injection of the root bus and less than 10% of bus voltages are measurable. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.06776v1-abstract-full').style.display = 'none'; document.getElementById('2408.06776v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.02074">arXiv:2408.02074</a> <span> [<a href="https://arxiv.org/pdf/2408.02074">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Applying Conditional Generative Adversarial Networks for Imaging Diagnosis </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Yang%2C+H">Haowei Yang</a>, <a href="/search/eess?searchtype=author&query=Hu%2C+Y">Yuxiang Hu</a>, <a href="/search/eess?searchtype=author&query=He%2C+S">Shuyao He</a>, <a href="/search/eess?searchtype=author&query=Xu%2C+T">Ting Xu</a>, <a href="/search/eess?searchtype=author&query=Yuan%2C+J">Jiajie Yuan</a>, <a href="/search/eess?searchtype=author&query=Gu%2C+X">Xingxin Gu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.02074v1-abstract-short" style="display: inline;"> This study introduces an innovative application of Conditional Generative Adversarial Networks (C-GAN) integrated with Stacked Hourglass Networks (SHGN) aimed at enhancing image segmentation, particularly in the challenging environment of medical imaging. We address the problem of overfitting, common in deep learning models applied to complex imaging datasets, by augmenting data through rotation a… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.02074v1-abstract-full').style.display = 'inline'; document.getElementById('2408.02074v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.02074v1-abstract-full" style="display: none;"> This study introduces an innovative application of Conditional Generative Adversarial Networks (C-GAN) integrated with Stacked Hourglass Networks (SHGN) aimed at enhancing image segmentation, particularly in the challenging environment of medical imaging. We address the problem of overfitting, common in deep learning models applied to complex imaging datasets, by augmenting data through rotation and scaling. A hybrid loss function combining L1 and L2 reconstruction losses, enriched with adversarial training, is introduced to refine segmentation processes in intravascular ultrasound (IVUS) imaging. Our approach is unique in its capacity to accurately delineate distinct regions within medical images, such as tissue boundaries and vascular structures, without extensive reliance on domain-specific knowledge. 
The algorithm was evaluated using a standard medical image library, showing superior performance metrics compared to existing methods, thereby demonstrating its potential in enhancing automated medical diagnostics through deep learning <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.02074v1-abstract-full').style.display = 'none'; document.getElementById('2408.02074v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.18593">arXiv:2407.18593</a> <span> [<a href="https://arxiv.org/pdf/2407.18593">pdf</a>, <a href="https://arxiv.org/format/2407.18593">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1109/TGRS.2024.3435079">10.1109/TGRS.2024.3435079 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Content-driven Magnitude-Derivative Spectrum Complementary Learning for Hyperspectral Image Classification </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Bai%2C+H">Huiyan Bai</a>, <a href="/search/eess?searchtype=author&query=Xu%2C+T">Tingfa Xu</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+H">Huan Chen</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+P">Peifu Liu</a>, <a href="/search/eess?searchtype=author&query=Li%2C+J">Jianan Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.18593v1-abstract-short" style="display: inline;"> Extracting discriminative information from complex spectral details in hyperspectral image (HSI) for HSI classification is pivotal. While current prevailing methods rely on spectral magnitude features, they could cause confusion in certain classes, resulting in misclassification and decreased accuracy. We find that the derivative spectrum proves more adept at capturing concealed information, there… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.18593v1-abstract-full').style.display = 'inline'; document.getElementById('2407.18593v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.18593v1-abstract-full" style="display: none;"> Extracting discriminative information from complex spectral details in hyperspectral image (HSI) for HSI classification is pivotal. While current prevailing methods rely on spectral magnitude features, they could cause confusion in certain classes, resulting in misclassification and decreased accuracy. 
We find that the derivative spectrum proves more adept at capturing concealed information, thereby offering a distinct advantage in separating these confusion classes. Leveraging the complementarity between spectral magnitude and derivative features, we propose a Content-driven Spectrum Complementary Network based on Magnitude-Derivative Dual Encoder, employing these two features as combined inputs. To fully utilize their complementary information, we raise a Content-adaptive Point-wise Fusion Module, enabling adaptive fusion of dual-encoder features in a point-wise selective manner, contingent upon feature representation. To preserve a rich source of complementary information while extracting more distinguishable features, we introduce a Hybrid Disparity-enhancing Loss that enhances the differential expression of the features from the two branches and increases the inter-class distance. As a result, our method achieves state-of-the-art results on the extensive WHU-OHS dataset and eight other benchmark datasets. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.18593v1-abstract-full').style.display = 'none'; document.getElementById('2407.18593v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">accepted by TGRS</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.07667">arXiv:2407.07667</a> <span> [<a href="https://arxiv.org/pdf/2407.07667">pdf</a>, <a href="https://arxiv.org/format/2407.07667">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> VEnhancer: Generative Space-Time Enhancement for Video Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=He%2C+J">Jingwen He</a>, <a href="/search/eess?searchtype=author&query=Xue%2C+T">Tianfan Xue</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+D">Dongyang Liu</a>, <a href="/search/eess?searchtype=author&query=Lin%2C+X">Xinqi Lin</a>, <a href="/search/eess?searchtype=author&query=Gao%2C+P">Peng Gao</a>, <a href="/search/eess?searchtype=author&query=Lin%2C+D">Dahua Lin</a>, <a href="/search/eess?searchtype=author&query=Qiao%2C+Y">Yu Qiao</a>, <a href="/search/eess?searchtype=author&query=Ouyang%2C+W">Wanli Ouyang</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+Z">Ziwei Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.07667v1-abstract-short" style="display: inline;"> We present VEnhancer, a generative space-time enhancement framework that improves the existing text-to-video results by adding more details in spatial domain and synthetic detailed motion in temporal domain. 
Given a generated low-quality video, our approach can increase its spatial and temporal resolution simultaneously with arbitrary up-sampling space and time scales through a unified video diffu… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.07667v1-abstract-full').style.display = 'inline'; document.getElementById('2407.07667v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.07667v1-abstract-full" style="display: none;"> We present VEnhancer, a generative space-time enhancement framework that improves the existing text-to-video results by adding more details in spatial domain and synthetic detailed motion in temporal domain. Given a generated low-quality video, our approach can increase its spatial and temporal resolution simultaneously with arbitrary up-sampling space and time scales through a unified video diffusion model. Furthermore, VEnhancer effectively removes generated spatial artifacts and temporal flickering of generated videos. To achieve this, building on a pretrained video diffusion model, we train a video ControlNet and inject it into the diffusion model as a condition on low frame-rate and low-resolution videos. To effectively train this video ControlNet, we design space-time data augmentation as well as video-aware conditioning. Benefiting from the above designs, VEnhancer remains stable during training and supports an elegant end-to-end training manner. Extensive experiments show that VEnhancer surpasses existing state-of-the-art video super-resolution and space-time super-resolution methods in enhancing AI-generated videos. Moreover, with VEnhancer, the existing open-source state-of-the-art text-to-video method, VideoCrafter-2, reaches the top of the video generation benchmark, VBench. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.07667v1-abstract-full').style.display = 'none'; document.getElementById('2407.07667v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024.
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">technical report</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.01530">arXiv:2407.01530</a> <span> [<a href="https://arxiv.org/pdf/2407.01530">pdf</a>, <a href="https://arxiv.org/format/2407.01530">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> xLSTM-UNet can be an Effective 2D & 3D Medical Image Segmentation Backbone with Vision-LSTM (ViL) better than its Mamba Counterpart </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Chen%2C+T">Tianrun Chen</a>, <a href="/search/eess?searchtype=author&query=Ding%2C+C">Chaotao Ding</a>, <a href="/search/eess?searchtype=author&query=Zhu%2C+L">Lanyun Zhu</a>, <a href="/search/eess?searchtype=author&query=Xu%2C+T">Tao Xu</a>, <a href="/search/eess?searchtype=author&query=Ji%2C+D">Deyi Ji</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+Y">Yan Wang</a>, <a href="/search/eess?searchtype=author&query=Zang%2C+Y">Ying Zang</a>, <a href="/search/eess?searchtype=author&query=Li%2C+Z">Zejian Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.01530v2-abstract-short" style="display: inline;"> Convolutional Neural Networks (CNNs) and Vision Transformers (ViT) have been pivotal in biomedical image segmentation, yet their ability to manage long-range dependencies remains constrained by inherent locality and computational overhead. To overcome these challenges, in this technical report, we first propose xLSTM-UNet, a UNet structured deep learning neural network that leverages Vision-LSTM (… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.01530v2-abstract-full').style.display = 'inline'; document.getElementById('2407.01530v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.01530v2-abstract-full" style="display: none;"> Convolutional Neural Networks (CNNs) and Vision Transformers (ViT) have been pivotal in biomedical image segmentation, yet their ability to manage long-range dependencies remains constrained by inherent locality and computational overhead. To overcome these challenges, in this technical report, we first propose xLSTM-UNet, a UNet structured deep learning neural network that leverages Vision-LSTM (xLSTM) as its backbone for medical image segmentation. xLSTM was recently proposed as the successor of Long Short-Term Memory (LSTM) networks and has demonstrated superior performance compared to Transformers and State Space Models (SSMs) like Mamba in Natural Language Processing (NLP) and image classification (as demonstrated in Vision-LSTM, or ViL implementation). Here, the xLSTM-UNet we designed extends this success to the biomedical image segmentation domain.
By integrating the local feature extraction strengths of convolutional layers with the long-range dependency capturing abilities of xLSTM, xLSTM-UNet offers a robust solution for comprehensive image analysis. We validate the efficacy of xLSTM-UNet through experiments. Our findings demonstrate that xLSTM-UNet consistently surpasses the performance of leading CNN-based, Transformer-based, and Mamba-based segmentation networks in multiple datasets in biomedical segmentation including organs in abdomen MRI, instruments in endoscopic images, and cells in microscopic images. With comprehensive experiments performed, this technical report highlights the potential of xLSTM-based architectures in advancing biomedical image analysis in both 2D and 3D. The code, models, and datasets are publicly available at http://tianrun-chen.github.io/xLSTM-UNet/ <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.01530v2-abstract-full').style.display = 'none'; document.getElementById('2407.01530v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 1 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.18548">arXiv:2406.18548</a> <span> [<a href="https://arxiv.org/pdf/2406.18548">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Exploration of Multi-Scale Image Fusion Systems in Intelligent Medical Image Analysis </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Hu%2C+Y">Yuxiang Hu</a>, <a href="/search/eess?searchtype=author&query=Yang%2C+H">Haowei Yang</a>, <a href="/search/eess?searchtype=author&query=Xu%2C+T">Ting Xu</a>, <a href="/search/eess?searchtype=author&query=He%2C+S">Shuyao He</a>, <a href="/search/eess?searchtype=author&query=Yuan%2C+J">Jiajie Yuan</a>, <a href="/search/eess?searchtype=author&query=Deng%2C+H">Haozhang Deng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.18548v1-abstract-short" style="display: inline;"> The diagnosis of brain cancer relies heavily on medical imaging techniques, with MRI being the most commonly used. It is necessary to perform automatic segmentation of brain tumors on MRI images. This project intends to build an MRI algorithm based on U-Net. 
The residual network and the module used to enhance the context information are combined, and the void space convolution pooling pyramid is a… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.18548v1-abstract-full').style.display = 'inline'; document.getElementById('2406.18548v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.18548v1-abstract-full" style="display: none;"> The diagnosis of brain cancer relies heavily on medical imaging techniques, with MRI being the most commonly used. It is necessary to perform automatic segmentation of brain tumors on MRI images. This project intends to build an MRI algorithm based on U-Net. The residual network and the module used to enhance the context information are combined, and the void space convolution pooling pyramid is added to the network for processing. The brain glioma MRI image dataset provided by cancer imaging archives was experimentally verified. A multi-scale segmentation method based on a weighted least squares filter was used to complete the 3D reconstruction of brain tumors. Thus, the accuracy of three-dimensional reconstruction is further improved. Experiments show that the local texture features obtained by the proposed algorithm are similar to those obtained by laser scanning. The algorithm is improved by using the U-Net method and an accuracy of 0.9851 is obtained. This approach significantly enhances the precision of image segmentation and boosts the efficiency of image classification. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.18548v1-abstract-full').style.display = 'none'; document.getElementById('2406.18548v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.04776">arXiv:2406.04776</a> <span> [<a href="https://arxiv.org/pdf/2406.04776">pdf</a>, <a href="https://arxiv.org/ps/2406.04776">ps</a>, <a href="https://arxiv.org/format/2406.04776">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> OFDM-Standard Compatible SC-NOFS Waveforms for Low-Latency and Jitter-Tolerance Industrial IoT Communications </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Xu%2C+T">Tongyang Xu</a>, <a href="/search/eess?searchtype=author&query=Li%2C+S">Shuangyang Li</a>, <a href="/search/eess?searchtype=author&query=Yuan%2C+J">Jinhong Yuan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.04776v1-abstract-short" style="display: inline;"> Traditional communications focus on regular and orthogonal signal waveforms for simplified signal processing and improved spectral efficiency. 
In contrast, the next-generation communications would aim for irregular and non-orthogonal signal waveforms to introduce new capabilities. This work proposes a spectrally efficient irregular Sinc (irSinc) shaping technique, revisiting the traditional Sinc b… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.04776v1-abstract-full').style.display = 'inline'; document.getElementById('2406.04776v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.04776v1-abstract-full" style="display: none;"> Traditional communications focus on regular and orthogonal signal waveforms for simplified signal processing and improved spectral efficiency. In contrast, the next-generation communications would aim for irregular and non-orthogonal signal waveforms to introduce new capabilities. This work proposes a spectrally efficient irregular Sinc (irSinc) shaping technique, revisiting the traditional Sinc back to 1924, with the aim of enhancing performance in industrial Internet of things (IIoT). In time-critical IIoT applications, low-latency and time-jitter tolerance are two critical factors that significantly impact the performance and reliability. Recognizing the inevitability of latency and jitter in practice, this work aims to propose a waveform technique to mitigate these effects via reducing latency and enhancing the system robustness under time jitter effects. The utilization of irSinc yields a signal with increased spectral efficiency without sacrificing error performance. Integrating the irSinc in a two-stage framework, a single-carrier non-orthogonal frequency shaping (SC-NOFS) waveform is developed, showcasing perfect compatibility with 5G standards, enabling the direct integration of irSinc in existing industrial IoT setups. Through 5G standard signal configuration, our signal achieves faster data transmission within the same spectral bandwidth. Hardware experiments validate an 18% saving in timing resources, leading to either reduced latency or enhanced jitter tolerance. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.04776v1-abstract-full').style.display = 'none'; document.getElementById('2406.04776v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.15153">arXiv:2405.15153</a> <span> [<a href="https://arxiv.org/pdf/2405.15153">pdf</a>, <a href="https://arxiv.org/format/2405.15153">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Optimal Reference Nodes Deployment for Positioning Seafloor Anchor Nodes </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Huang%2C+W">Wei Huang</a>, <a href="/search/eess?searchtype=author&query=Wu%2C+P">Pengfei Wu</a>, <a href="/search/eess?searchtype=author&query=Xu%2C+T">Tianhe Xu</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+H">Hao Zhang</a>, <a href="/search/eess?searchtype=author&query=Meng%2C+K">Kaitao Meng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.15153v1-abstract-short" style="display: inline;"> Seafloor anchor nodes, which form a geodetic network, are designed to provide surface and underwater users with positioning, navigation and timing (PNT) services. Due to the non-uniform distribution of underwater sound speed, accurate positioning of underwater anchor nodes is a challenge work. Traditional anchor node positioning typically uses cross or circular shapes, however, how to optimize the… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.15153v1-abstract-full').style.display = 'inline'; document.getElementById('2405.15153v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.15153v1-abstract-full" style="display: none;"> Seafloor anchor nodes, which form a geodetic network, are designed to provide surface and underwater users with positioning, navigation and timing (PNT) services. Due to the non-uniform distribution of underwater sound speed, accurate positioning of underwater anchor nodes is a challenge work. Traditional anchor node positioning typically uses cross or circular shapes, however, how to optimize the deployment of reference nodes for positioning underwater anchor nodes considering the variability of sound speed has not yet been studied. This paper focuses on the optimal reference nodes deployment strategies for time--of--arrival (TOA) localization in the three-dimensional (3D) underwater space. We adopt the criterion that minimizing the trace of the inverse Fisher information matrix (FIM) to determine optimal reference nodes deployment with Gaussian measurement noise, which is positive related to the signal propagation path. A comprehensive analysis of optimal reference-target geometries is provided in the general circumstance with no restriction on the number of reference nodes, elevation angle and reference-target range. A new semi-closed form solution is found to detemine the optimal geometries. To demonstrate the findings in this paper, we conducted both simulations and sea trials on underwater anchor node positioning. Both the simulation and experiment results are consistent with theoretical analysis. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.15153v1-abstract-full').style.display = 'none'; document.getElementById('2405.15153v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.07685">arXiv:2405.07685</a> <span> [<a href="https://arxiv.org/pdf/2405.07685">pdf</a>, <a href="https://arxiv.org/format/2405.07685">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> Edge Computing for IoT: Novel Insights from a Comparative Analysis of Access Control Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Xue%2C+T">Tao Xue</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+Y">Ying Zhang</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+Y">Yanbin Wang</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+W">Wenbo Wang</a>, <a href="/search/eess?searchtype=author&query=Li%2C+S">Shuailou Li</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+H">Haibin Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.07685v3-abstract-short" style="display: inline;"> IoT edge computing positions computing resources closer to the data sources to reduce the latency, relieve the bandwidth pressure on the cloud, and enhance data security. Nevertheless, data security in IoT edge computing still faces critical threats (e.g., data breaches). Access control is fundamental for mitigating these threats. However, IoT edge computing introduces notable challenges for achie… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.07685v3-abstract-full').style.display = 'inline'; document.getElementById('2405.07685v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.07685v3-abstract-full" style="display: none;"> IoT edge computing positions computing resources closer to the data sources to reduce the latency, relieve the bandwidth pressure on the cloud, and enhance data security. Nevertheless, data security in IoT edge computing still faces critical threats (e.g., data breaches). Access control is fundamental for mitigating these threats. However, IoT edge computing introduces notable challenges for achieving resource-conserving, low-latency, flexible, and scalable access control. To review recent access control measures, we novelly organize them according to different data lifecycles--data collection, storage, and usage--and, meanwhile, review blockchain technology in this novel organization. In this way, we provide novel insights and envisage several potential research directions. This survey can help readers find gaps systematically and prompt the development of access control techniques in IoT edge computing under the intricacy of innovations in access control. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.07685v3-abstract-full').style.display = 'none'; document.getElementById('2405.07685v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 13 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.02801">arXiv:2405.02801</a> <span> [<a href="https://arxiv.org/pdf/2405.02801">pdf</a>, <a href="https://arxiv.org/format/2405.02801">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Mozart's Touch: A Lightweight Multi-modal Music Generation Framework Based on Pre-Trained Large Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Li%2C+J">Jiajun Li</a>, <a href="/search/eess?searchtype=author&query=Xu%2C+T">Tianze Xu</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+X">Xuesong Chen</a>, <a href="/search/eess?searchtype=author&query=Yao%2C+X">Xinrui Yao</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+S">Shuchang Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.02801v3-abstract-short" style="display: inline;"> In recent years, AI-Generated Content (AIGC) has witnessed rapid advancements, facilitating the creation of music, images, and other artistic forms across a wide range of industries. However, current models for image- and video-to-music synthesis struggle to capture the nuanced emotions and atmosphere conveyed by visual content. To fill this gap, we propose Mozart's Touch, a multi-modal music gene… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.02801v3-abstract-full').style.display = 'inline'; document.getElementById('2405.02801v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.02801v3-abstract-full" style="display: none;"> In recent years, AI-Generated Content (AIGC) has witnessed rapid advancements, facilitating the creation of music, images, and other artistic forms across a wide range of industries. However, current models for image- and video-to-music synthesis struggle to capture the nuanced emotions and atmosphere conveyed by visual content. To fill this gap, we propose Mozart's Touch, a multi-modal music generation framework capable of generating music aligned with cross-modal inputs such as images, videos, and text. The framework consists of three key components: Multi-modal Captioning Module, Large Language Model (LLM) understanding \& Bridging Module, and Music Generation Module. 
Unlike traditional end-to-end methods, Mozart's Touch uses LLMs to accurately interpret visual elements without requiring the training or fine-tuning of music generation models, providing efficiency and transparency through clear, interpretable prompts. We also introduce the "LLM-Bridge" method to resolve the heterogeneous representation challenges between descriptive texts from different modalities. Through a series of objective and subjective evaluations, we demonstrate that Mozart's Touch outperforms current state-of-the-art models. Our code and examples are available at https://github.com/TiffanyBlews/MozartsTouch. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.02801v3-abstract-full').style.display = 'none'; document.getElementById('2405.02801v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 4 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">10 pages, 2 figures, submitted to AIGC 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.02132">arXiv:2405.02132</a> <span> [<a href="https://arxiv.org/pdf/2405.02132">pdf</a>, <a href="https://arxiv.org/format/2405.02132">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Unveiling the Potential of LLM-Based ASR on Chinese Open-Source Datasets </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Geng%2C+X">Xuelong Geng</a>, <a href="/search/eess?searchtype=author&query=Xu%2C+T">Tianyi Xu</a>, <a href="/search/eess?searchtype=author&query=Wei%2C+K">Kun Wei</a>, <a href="/search/eess?searchtype=author&query=Mu%2C+B">Bingshen Mu</a>, <a href="/search/eess?searchtype=author&query=Xue%2C+H">Hongfei Xue</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+H">He Wang</a>, <a href="/search/eess?searchtype=author&query=Li%2C+Y">Yangze Li</a>, <a href="/search/eess?searchtype=author&query=Guo%2C+P">Pengcheng Guo</a>, <a href="/search/eess?searchtype=author&query=Dai%2C+Y">Yuhang Dai</a>, <a href="/search/eess?searchtype=author&query=Li%2C+L">Longhao Li</a>, <a href="/search/eess?searchtype=author&query=Shao%2C+M">Mingchen Shao</a>, <a href="/search/eess?searchtype=author&query=Xie%2C+L">Lei Xie</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.02132v3-abstract-short" style="display: inline;"> Large Language Models (LLMs) have demonstrated unparalleled effectiveness in various NLP tasks, and integrating LLMs with automatic speech recognition (ASR) is becoming a mainstream paradigm. 
Building upon this momentum, our research delves into an in-depth examination of this paradigm on a large open-source Chinese dataset. Specifically, our research aims to evaluate the impact of various configu… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.02132v3-abstract-full').style.display = 'inline'; document.getElementById('2405.02132v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.02132v3-abstract-full" style="display: none;"> Large Language Models (LLMs) have demonstrated unparalleled effectiveness in various NLP tasks, and integrating LLMs with automatic speech recognition (ASR) is becoming a mainstream paradigm. Building upon this momentum, our research delves into an in-depth examination of this paradigm on a large open-source Chinese dataset. Specifically, our research aims to evaluate the impact of various configurations of speech encoders, LLMs, and projector modules in the context of the speech foundation encoder-LLM ASR paradigm. Furthermore, we introduce a three-stage training approach, expressly developed to enhance the model's ability to align auditory and textual information. The implementation of this approach, alongside the strategic integration of ASR components, enabled us to achieve the SOTA performance on the AISHELL-1, Test_Net, and Test_Meeting test sets. Our analysis presents an empirical foundation for future research in LLM-based ASR systems and offers insights into optimizing performance using Chinese datasets. We will publicly release all scripts used for data preparation, training, inference, and scoring, as well as pre-trained models and training logs to promote reproducible research. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.02132v3-abstract-full').style.display = 'none'; document.getElementById('2405.02132v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 3 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.04848">arXiv:2404.04848</a> <span> [<a href="https://arxiv.org/pdf/2404.04848">pdf</a>, <a href="https://arxiv.org/format/2404.04848">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Task-Aware Encoder Control for Deep Video Compression </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Ge%2C+X">Xingtong Ge</a>, <a href="/search/eess?searchtype=author&query=Luo%2C+J">Jixiang Luo</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+X">Xinjie Zhang</a>, <a href="/search/eess?searchtype=author&query=Xu%2C+T">Tongda Xu</a>, <a href="/search/eess?searchtype=author&query=Lu%2C+G">Guo Lu</a>, <a href="/search/eess?searchtype=author&query=He%2C+D">Dailan He</a>, <a href="/search/eess?searchtype=author&query=Geng%2C+J">Jing Geng</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+Y">Yan Wang</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+J">Jun Zhang</a>, <a href="/search/eess?searchtype=author&query=Qin%2C+H">Hongwei Qin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.04848v2-abstract-short" style="display: inline;"> Prior research on deep video compression (DVC) for machine tasks typically necessitates training a unique codec for each specific task, mandating a dedicated decoder per task. In contrast, traditional video codecs employ a flexible encoder controller, enabling the adaptation of a single codec to different tasks through mechanisms like mode prediction. Drawing inspiration from this, we introduce an… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.04848v2-abstract-full').style.display = 'inline'; document.getElementById('2404.04848v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.04848v2-abstract-full" style="display: none;"> Prior research on deep video compression (DVC) for machine tasks typically necessitates training a unique codec for each specific task, mandating a dedicated decoder per task. In contrast, traditional video codecs employ a flexible encoder controller, enabling the adaptation of a single codec to different tasks through mechanisms like mode prediction. Drawing inspiration from this, we introduce an innovative encoder controller for deep video compression for machines. This controller features a mode prediction and a Group of Pictures (GoP) selection module. Our approach centralizes control at the encoding stage, allowing for adaptable encoder adjustments across different tasks, such as detection and tracking, while maintaining compatibility with a standard pre-trained DVC decoder. Empirical evidence demonstrates that our method is applicable across multiple tasks with various existing pre-trained DVCs. 
Moreover, extensive experiments demonstrate that our method outperforms previous DVC by about 25% bitrate for different tasks, with only one pre-trained decoder. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.04848v2-abstract-full').style.display = 'none'; document.getElementById('2404.04848v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 7 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by CVPR 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.08551">arXiv:2403.08551</a> <span> [<a href="https://arxiv.org/pdf/2403.08551">pdf</a>, <a href="https://arxiv.org/format/2403.08551">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> </div> </div> <p class="title is-5 mathjax"> GaussianImage: 1000 FPS Image Representation and Compression by 2D Gaussian Splatting </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Zhang%2C+X">Xinjie Zhang</a>, <a href="/search/eess?searchtype=author&query=Ge%2C+X">Xingtong Ge</a>, <a href="/search/eess?searchtype=author&query=Xu%2C+T">Tongda Xu</a>, <a href="/search/eess?searchtype=author&query=He%2C+D">Dailan He</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+Y">Yan Wang</a>, <a href="/search/eess?searchtype=author&query=Qin%2C+H">Hongwei Qin</a>, <a href="/search/eess?searchtype=author&query=Lu%2C+G">Guo Lu</a>, <a href="/search/eess?searchtype=author&query=Geng%2C+J">Jing Geng</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+J">Jun Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.08551v5-abstract-short" style="display: inline;"> Implicit neural representations (INRs) recently achieved great success in image representation and compression, offering high visual quality and fast rendering speeds with 10-1000 FPS, assuming sufficient GPU resources are available. However, this requirement often hinders their use on low-end devices with limited memory. 
In response, we propose a groundbreaking paradigm of image representation an… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.08551v5-abstract-full').style.display = 'inline'; document.getElementById('2403.08551v5-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.08551v5-abstract-full" style="display: none;"> Implicit neural representations (INRs) recently achieved great success in image representation and compression, offering high visual quality and fast rendering speeds with 10-1000 FPS, assuming sufficient GPU resources are available. However, this requirement often hinders their use on low-end devices with limited memory. In response, we propose a groundbreaking paradigm of image representation and compression by 2D Gaussian Splatting, named GaussianImage. We first introduce 2D Gaussian to represent the image, where each Gaussian has 8 parameters including position, covariance and color. Subsequently, we unveil a novel rendering algorithm based on accumulated summation. Remarkably, our method with a minimum of 3$\times$ lower GPU memory usage and 5$\times$ faster fitting time not only rivals INRs (e.g., WIRE, I-NGP) in representation performance, but also delivers a faster rendering speed of 1500-2000 FPS regardless of parameter size. Furthermore, we integrate existing vector quantization technique to build an image codec. Experimental results demonstrate that our codec attains rate-distortion performance comparable to compression-based INRs such as COIN and COIN++, while facilitating decoding speeds of approximately 2000 FPS. Additionally, preliminary proof of concept shows that our codec surpasses COIN and COIN++ in performance when using partial bits-back coding. Code is available at https://github.com/Xinjie-Q/GaussianImage. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.08551v5-abstract-full').style.display = 'none'; document.getElementById('2403.08551v5-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 13 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by ECCV 2024. 
Project Page: https://xingtongge.github.io/GaussianImage-page/ Code: https://github.com/Xinjie-Q/GaussianImage</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.08505">arXiv:2403.08505</a> <span> [<a href="https://arxiv.org/pdf/2403.08505">pdf</a>, <a href="https://arxiv.org/format/2403.08505">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> </div> </div> <p class="title is-5 mathjax"> CAMSIC: Content-aware Masked Image Modeling Transformer for Stereo Image Compression </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Zhang%2C+X">Xinjie Zhang</a>, <a href="/search/eess?searchtype=author&query=Gao%2C+S">Shenyuan Gao</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+Z">Zhening Liu</a>, <a href="/search/eess?searchtype=author&query=Shao%2C+J">Jiawei Shao</a>, <a href="/search/eess?searchtype=author&query=Ge%2C+X">Xingtong Ge</a>, <a href="/search/eess?searchtype=author&query=He%2C+D">Dailan He</a>, <a href="/search/eess?searchtype=author&query=Xu%2C+T">Tongda Xu</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+Y">Yan Wang</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+J">Jun Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.08505v5-abstract-short" style="display: inline;"> Existing learning-based stereo image codecs adopt sophisticated transformations with simple entropy models derived from single image codecs to encode latent representations. However, those entropy models struggle to effectively capture the spatial-disparity characteristics inherent in stereo images, which leads to suboptimal rate-distortion results. In this paper, we propose a stereo image compressi… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.08505v5-abstract-full').style.display = 'inline'; document.getElementById('2403.08505v5-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.08505v5-abstract-full" style="display: none;"> Existing learning-based stereo image codecs adopt sophisticated transformations with simple entropy models derived from single image codecs to encode latent representations. However, those entropy models struggle to effectively capture the spatial-disparity characteristics inherent in stereo images, which leads to suboptimal rate-distortion results. In this paper, we propose a stereo image compression framework, named CAMSIC. CAMSIC independently transforms each image into a latent representation and employs a powerful decoder-free Transformer entropy model to capture both spatial and disparity dependencies, by introducing a novel content-aware masked image modeling (MIM) technique. 
Our content-aware MIM facilitates efficient bidirectional interaction between prior information and estimated tokens, which naturally obviates the need for an extra Transformer decoder. Experiments show that our stereo image codec achieves state-of-the-art rate-distortion performance on two stereo image datasets Cityscapes and InStereo2K with fast encoding and decoding speed. Code is available at https://github.com/Xinjie-Q/CAMSIC. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.08505v5-abstract-full').style.display = 'none'; document.getElementById('2403.08505v5-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 13 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by AAAI 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2402.18152">arXiv:2402.18152</a> <span> [<a href="https://arxiv.org/pdf/2402.18152">pdf</a>, <a href="https://arxiv.org/format/2402.18152">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Boosting Neural Representations for Videos with a Conditional Decoder </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Zhang%2C+X">Xinjie Zhang</a>, <a href="/search/eess?searchtype=author&query=Yang%2C+R">Ren Yang</a>, <a href="/search/eess?searchtype=author&query=He%2C+D">Dailan He</a>, <a href="/search/eess?searchtype=author&query=Ge%2C+X">Xingtong Ge</a>, <a href="/search/eess?searchtype=author&query=Xu%2C+T">Tongda Xu</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+Y">Yan Wang</a>, <a href="/search/eess?searchtype=author&query=Qin%2C+H">Hongwei Qin</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+J">Jun Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2402.18152v3-abstract-short" style="display: inline;"> Implicit neural representations (INRs) have emerged as a promising approach for video storage and processing, showing remarkable versatility across various video tasks. However, existing methods often fail to fully leverage their representation capabilities, primarily due to inadequate alignment of intermediate features during target frame decoding. 
This paper introduces a universal boosting frame… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.18152v3-abstract-full').style.display = 'inline'; document.getElementById('2402.18152v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2402.18152v3-abstract-full" style="display: none;"> Implicit neural representations (INRs) have emerged as a promising approach for video storage and processing, showing remarkable versatility across various video tasks. However, existing methods often fail to fully leverage their representation capabilities, primarily due to inadequate alignment of intermediate features during target frame decoding. This paper introduces a universal boosting framework for current implicit video representation approaches. Specifically, we utilize a conditional decoder with a temporal-aware affine transform module, which uses the frame index as a prior condition to effectively align intermediate features with target frames. Besides, we introduce a sinusoidal NeRV-like block to generate diverse intermediate features and achieve a more balanced parameter distribution, thereby enhancing the model's capacity. With a high-frequency information-preserving reconstruction loss, our approach successfully boosts multiple baseline INRs in the reconstruction quality and convergence speed for video regression, and exhibits superior inpainting and interpolation results. Further, we integrate a consistent entropy minimization technique and develop video codecs based on these boosted INRs. Experiments on the UVG dataset confirm that our enhanced codecs significantly outperform baseline INRs and offer competitive rate-distortion performance compared to traditional and learning-based codecs. Code is available at https://github.com/Xinjie-Q/Boosting-NeRV. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.18152v3-abstract-full').style.display = 'none'; document.getElementById('2402.18152v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 28 February, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accept by CVPR 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2402.00535">arXiv:2402.00535</a> <span> [<a href="https://arxiv.org/pdf/2402.00535">pdf</a>, <a href="https://arxiv.org/ps/2402.00535">ps</a>, <a href="https://arxiv.org/format/2402.00535">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> A Low-Cost Multi-Band Waveform Security Framework in Resource-Constrained Communications </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Xu%2C+T">Tongyang Xu</a>, <a href="/search/eess?searchtype=author&query=Wei%2C+Z">Zhongxiang Wei</a>, <a href="/search/eess?searchtype=author&query=Xu%2C+T">Tianhua Xu</a>, <a href="/search/eess?searchtype=author&query=Zheng%2C+G">Gan Zheng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2402.00535v1-abstract-short" style="display: inline;"> Traditional physical layer secure beamforming is achieved via precoding before signal transmission using channel state information (CSI). However, imperfect CSI will compromise the performance with imperfect beamforming and potential information leakage. In addition, multiple RF chains and antennas are needed to support the narrow beam generation, which complicates hardware implementation and is n… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.00535v1-abstract-full').style.display = 'inline'; document.getElementById('2402.00535v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2402.00535v1-abstract-full" style="display: none;"> Traditional physical layer secure beamforming is achieved via precoding before signal transmission using channel state information (CSI). However, imperfect CSI will compromise the performance with imperfect beamforming and potential information leakage. In addition, multiple RF chains and antennas are needed to support the narrow beam generation, which complicates hardware implementation and is not suitable for resource-constrained Internet-of-Things (IoT) devices. Moreover, with the advancement of hardware and artificial intelligence (AI), low-cost and intelligent eavesdropping to wireless communications is becoming increasingly detrimental. In this paper, we propose a multi-carrier based multi-band waveform-defined security (WDS) framework, independent from CSI and RF chains, to defend against AI eavesdropping. Ideally, the continuous variations of sub-band structures lead to an infinite number of spectral features, which can potentially prevent brute-force eavesdropping. Sub-band spectral pattern information is efficiently constructed at legitimate users via a proposed chaotic sequence generator. A novel security metric, termed signal classification accuracy (SCA), is used to evaluate the security robustness under AI eavesdropping. Communication error probability and complexity are also investigated to show the reliability and practical capability of the proposed framework. 
Finally, compared to traditional secure beamforming techniques, the proposed multi-band WDS framework reduces power consumption by up to six times. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.00535v1-abstract-full').style.display = 'none'; document.getElementById('2402.00535v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 February, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2401.10070">arXiv:2401.10070</a> <span> [<a href="https://arxiv.org/pdf/2401.10070">pdf</a>, <a href="https://arxiv.org/format/2401.10070">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Communication-Efficient Personalized Federated Learning for Speech-to-Text Tasks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Du%2C+Y">Yichao Du</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+Z">Zhirui Zhang</a>, <a href="/search/eess?searchtype=author&query=Yue%2C+L">Linan Yue</a>, <a href="/search/eess?searchtype=author&query=Huang%2C+X">Xu Huang</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+Y">Yuqing Zhang</a>, <a href="/search/eess?searchtype=author&query=Xu%2C+T">Tong Xu</a>, <a href="/search/eess?searchtype=author&query=Xu%2C+L">Linli Xu</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+E">Enhong Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2401.10070v2-abstract-short" style="display: inline;"> To protect privacy and meet legal regulations, federated learning (FL) has gained significant attention for training speech-to-text (S2T) systems, including automatic speech recognition (ASR) and speech translation (ST). However, the commonly used FL approach (i.e., \textsc{FedAvg}) in S2T tasks typically suffers from extensive communication overhead due to multi-round interactions based on the wh… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2401.10070v2-abstract-full').style.display = 'inline'; document.getElementById('2401.10070v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2401.10070v2-abstract-full" style="display: none;"> To protect privacy and meet legal regulations, federated learning (FL) has gained significant attention for training speech-to-text (S2T) systems, including automatic speech recognition (ASR) and speech translation (ST). 
However, the commonly used FL approach (i.e., \textsc{FedAvg}) in S2T tasks typically suffers from extensive communication overhead due to multi-round interactions based on the whole model and performance degradation caused by data heterogeneity among clients. To address these issues, we propose a personalized federated S2T framework that introduces \textsc{FedLoRA}, a lightweight LoRA module for client-side tuning and interaction with the server to minimize communication overhead, and \textsc{FedMem}, a global model equipped with a $k$-nearest-neighbor ($k$NN) classifier that captures client-specific distributional shifts to achieve personalization and overcome data heterogeneity. Extensive experiments based on Conformer and Whisper backbone models on CoVoST and GigaSpeech benchmarks show that our approach significantly reduces the communication overhead on all S2T tasks and effectively personalizes the global model to overcome data heterogeneity. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2401.10070v2-abstract-full').style.display = 'none'; document.getElementById('2401.10070v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 18 January, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ICASSP 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2401.08920">arXiv:2401.08920</a> <span> [<a href="https://arxiv.org/pdf/2401.08920">pdf</a>, <a href="https://arxiv.org/format/2401.08920">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Idempotence and Perceptual Image Compression </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Xu%2C+T">Tongda Xu</a>, <a href="/search/eess?searchtype=author&query=Zhu%2C+Z">Ziran Zhu</a>, <a href="/search/eess?searchtype=author&query=He%2C+D">Dailan He</a>, <a href="/search/eess?searchtype=author&query=Li%2C+Y">Yanghao Li</a>, <a href="/search/eess?searchtype=author&query=Guo%2C+L">Lina Guo</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+Y">Yuanyuan Wang</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+Z">Zhe Wang</a>, <a href="/search/eess?searchtype=author&query=Qin%2C+H">Hongwei Qin</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+Y">Yan Wang</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+J">Jingjing Liu</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+Y">Ya-Qin Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2401.08920v2-abstract-short" style="display: inline;"> Idempotence is the stability of an image codec to re-compression. 
At first glance, it is unrelated to perceptual image compression. However, we find that theoretically: 1) Conditional generative model-based perceptual codec satisfies idempotence; 2) Unconditional generative model with idempotence constraint is equivalent to conditional generative codec. Based on this newfound equivalence, we prop… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2401.08920v2-abstract-full').style.display = 'inline'; document.getElementById('2401.08920v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2401.08920v2-abstract-full" style="display: none;"> Idempotence is the stability of an image codec to re-compression. At first glance, it is unrelated to perceptual image compression. However, we find that theoretically: 1) Conditional generative model-based perceptual codec satisfies idempotence; 2) Unconditional generative model with idempotence constraint is equivalent to conditional generative codec. Based on this newfound equivalence, we propose a new paradigm of perceptual image codec by inverting unconditional generative model with idempotence constraints. Our codec is theoretically equivalent to conditional generative codec, and it does not require training new models. Instead, it only requires a pre-trained mean-square-error codec and unconditional generative model. Empirically, we show that our proposed approach outperforms state-of-the-art methods such as HiFiC and ILLM, in terms of Fréchet Inception Distance (FID). The source code is provided at https://github.com/tongdaxu/Idempotence-and-Perceptual-Image-Compression. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2401.08920v2-abstract-full').style.display = 'none'; document.getElementById('2401.08920v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 16 January, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ICLR 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2401.04579">arXiv:2401.04579</a> <span> [<a href="https://arxiv.org/pdf/2401.04579">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Quantitative Methods">q-bio.QM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> A Deep Network for Explainable Prediction of Non-Imaging Phenotypes using Anatomical Multi-View Data </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Wei%2C+Y">Yuxiang Wei</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+Y">Yuqian Chen</a>, <a href="/search/eess?searchtype=author&query=Xue%2C+T">Tengfei Xue</a>, <a href="/search/eess?searchtype=author&query=Zekelman%2C+L">Leo Zekelman</a>, <a href="/search/eess?searchtype=author&query=Makris%2C+N">Nikos Makris</a>, <a href="/search/eess?searchtype=author&query=Rathi%2C+Y">Yogesh Rathi</a>, <a href="/search/eess?searchtype=author&query=Cai%2C+W">Weidong Cai</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+F">Fan Zhang</a>, <a href="/search/eess?searchtype=author&query=Donnell%2C+L+J+O">Lauren J. O' Donnell</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2401.04579v2-abstract-short" style="display: inline;"> Large datasets often contain multiple distinct feature sets, or views, that offer complementary information that can be exploited by multi-view learning methods to improve results. We investigate anatomical multi-view data, where each brain anatomical structure is described with multiple feature sets. In particular, we focus on sets of white matter microstructure and connectivity features from dif… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2401.04579v2-abstract-full').style.display = 'inline'; document.getElementById('2401.04579v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2401.04579v2-abstract-full" style="display: none;"> Large datasets often contain multiple distinct feature sets, or views, that offer complementary information that can be exploited by multi-view learning methods to improve results. We investigate anatomical multi-view data, where each brain anatomical structure is described with multiple feature sets. In particular, we focus on sets of white matter microstructure and connectivity features from diffusion MRI, as well as sets of gray matter area and thickness features from structural MRI. We investigate machine learning methodology that applies multi-view approaches to improve the prediction of non-imaging phenotypes, including demographics (age), motor (strength), and cognition (picture vocabulary). We present an explainable multi-view network (EMV-Net) that can use different anatomical views to improve prediction performance. 
In this network, each individual anatomical view is processed by a view-specific feature extractor and the extracted information from each view is fused using a learnable weight. This is followed by a wavelet transform-based module to obtain complementary information across views which is then applied to calibrate the view-specific information. Additionally, the calibrator produces an attention-based calibration score to indicate anatomical structures' importance for interpretation. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2401.04579v2-abstract-full').style.display = 'none'; document.getElementById('2401.04579v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 January, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 9 January, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">2023 The Medical Image Computing and Computer Assisted Intervention Society workshop</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2401.03363">arXiv:2401.03363</a> <span> [<a href="https://arxiv.org/pdf/2401.03363">pdf</a>, <a href="https://arxiv.org/format/2401.03363">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1109/TAC.2024.3417088">10.1109/TAC.2024.3417088 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Data-driven Dynamic Event-triggered Control </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Xu%2C+T">Tao Xu</a>, <a href="/search/eess?searchtype=author&query=Sun%2C+Z">Zhiyong Sun</a>, <a href="/search/eess?searchtype=author&query=Wen%2C+G">Guanghui Wen</a>, <a href="/search/eess?searchtype=author&query=Duan%2C+Z">Zhisheng Duan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2401.03363v1-abstract-short" style="display: inline;"> This paper revisits the event-triggered control problem from a data-driven perspective, where unknown continuous-time linear systems subject to disturbances are taken into account. Using data information collected off-line instead of accurate system model information, a data-driven dynamic event-triggered control scheme is developed in this paper. 
The dynamic property is reflected by that the desi… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2401.03363v1-abstract-full').style.display = 'inline'; document.getElementById('2401.03363v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2401.03363v1-abstract-full" style="display: none;"> This paper revisits the event-triggered control problem from a data-driven perspective, where unknown continuous-time linear systems subject to disturbances are taken into account. Using data information collected off-line instead of accurate system model information, a data-driven dynamic event-triggered control scheme is developed in this paper. The dynamic property is reflected by that the designed event-triggering function embedded in the event-triggering mechanism (ETM) is dynamically updated as a whole. Thanks to this dynamic design, a strictly positive minimum inter-event time (MIET) is guaranteed without sacrificing control performance. Specifically, exponential input-to-state stability (ISS) of the closed-loop system with respect to disturbances is achieved in this paper, which is superior to some existing results that only guarantee a practical exponential ISS property. The dynamic ETM is easy-to-implement in practical operation since all designed parameters are determined only by a simple data-driven linear matrix inequality (LMI), without additional complicated conditions as required in relevant literature. As quantization is the most common signal constraint in practice, the developed control scheme is further extended to the case where state transmission is affected by a uniform or logarithmic quantization effect. Finally, adequate simulations are performed to show the validity and superiority of the proposed control schemes. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2401.03363v1-abstract-full').style.display = 'none'; document.getElementById('2401.03363v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 January, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> IEEE Transactions on Automatic Control, vol. 69, no. 12, pp. 8804-8811, Dec. 
2024 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2312.01061">arXiv:2312.01061</a> <span> [<a href="https://arxiv.org/pdf/2312.01061">pdf</a>, <a href="https://arxiv.org/format/2312.01061">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1109/TCSVT.2023.3318366">10.1109/TCSVT.2023.3318366 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Spectral-wise Implicit Neural Representation for Hyperspectral Image Reconstruction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Chen%2C+H">Huan Chen</a>, <a href="/search/eess?searchtype=author&query=Zhao%2C+W">Wangcai Zhao</a>, <a href="/search/eess?searchtype=author&query=Xu%2C+T">Tingfa Xu</a>, <a href="/search/eess?searchtype=author&query=Zhou%2C+S">Shiyun Zhou</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+P">Peifu Liu</a>, <a href="/search/eess?searchtype=author&query=Li%2C+J">Jianan Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2312.01061v1-abstract-short" style="display: inline;"> Coded Aperture Snapshot Spectral Imaging (CASSI) reconstruction aims to recover the 3D spatial-spectral signal from 2D measurement. Existing methods for reconstructing Hyperspectral Image (HSI) typically involve learning mappings from a 2D compressed image to a predetermined set of discrete spectral bands. However, this approach overlooks the inherent continuity of the spectral information. In thi… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.01061v1-abstract-full').style.display = 'inline'; document.getElementById('2312.01061v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2312.01061v1-abstract-full" style="display: none;"> Coded Aperture Snapshot Spectral Imaging (CASSI) reconstruction aims to recover the 3D spatial-spectral signal from 2D measurement. Existing methods for reconstructing Hyperspectral Image (HSI) typically involve learning mappings from a 2D compressed image to a predetermined set of discrete spectral bands. However, this approach overlooks the inherent continuity of the spectral information. In this study, we propose an innovative method called Spectral-wise Implicit Neural Representation (SINR) as a pioneering step toward addressing this limitation. SINR introduces a continuous spectral amplification process for HSI reconstruction, enabling spectral super-resolution with customizable magnification factors. To achieve this, we leverage the concept of implicit neural representation. Specifically, our approach introduces a spectral-wise attention mechanism that treats individual channels as distinct tokens, thereby capturing global spectral dependencies. 
Additionally, our approach incorporates two components, namely a Fourier coordinate encoder and a spectral scale factor module. The Fourier coordinate encoder enhances the SINR's ability to emphasize high-frequency components, while the spectral scale factor module guides the SINR to adapt to the variable number of spectral channels. Notably, the SINR framework enhances the flexibility of CASSI reconstruction by accommodating an unlimited number of spectral bands in the desired output. Extensive experiments demonstrate that our SINR outperforms baseline methods. By enabling continuous reconstruction within the CASSI framework, we take the initial stride toward integrating implicit neural representation into the field. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.01061v1-abstract-full').style.display = 'none'; document.getElementById('2312.01061v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 December, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by IEEE Transactions on Circuits and Systems for Video Technology, to be published</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2311.15164">arXiv:2311.15164</a> <span> [<a href="https://arxiv.org/pdf/2311.15164">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Optics">physics.optics</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1002/lpor.202400187">10.1002/lpor.202400187 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Neural-Optic Co-Designed Polarization-Multiplexed Metalens for Compact Computational Spectral Imaging </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Zhang%2C+Q">Qiangbo Zhang</a>, <a href="/search/eess?searchtype=author&query=Lin%2C+P">Peicheng Lin</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+C">Chang Wang</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+Y">Yang Zhang</a>, <a href="/search/eess?searchtype=author&query=Yu%2C+Z">Zeqing Yu</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+X">Xinyu Liu</a>, <a href="/search/eess?searchtype=author&query=Xu%2C+T">Ting Xu</a>, <a href="/search/eess?searchtype=author&query=Zheng%2C+Z">Zhenrong Zheng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2311.15164v1-abstract-short" style="display: inline;"> As the realm of spectral imaging applications extends its reach into the domains of mobile technology and augmented reality, the demands for compact yet high-fidelity systems become increasingly pronounced. 
Conventional methodologies, exemplified by coded aperture snapshot spectral imaging systems, are significantly limited by their cumbersome physical dimensions and form factors. To address this… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2311.15164v1-abstract-full').style.display = 'inline'; document.getElementById('2311.15164v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2311.15164v1-abstract-full" style="display: none;"> As the realm of spectral imaging applications extends its reach into the domains of mobile technology and augmented reality, the demands for compact yet high-fidelity systems become increasingly pronounced. Conventional methodologies, exemplified by coded aperture snapshot spectral imaging systems, are significantly limited by their cumbersome physical dimensions and form factors. To address this inherent challenge, diffractive optical elements (DOEs) have been repeatedly employed as a means to mitigate issues related to the bulky nature of these systems. Nonetheless, it's essential to note that the capabilities of DOEs primarily revolve around the modulation of the phase of light. Here, we introduce an end-to-end computational spectral imaging framework based on a polarization-multiplexed metalens. A distinguishing feature of this approach lies in its capacity to simultaneously modulate orthogonal polarization channels. When harnessed in conjunction with a neural network, it facilitates the attainment of high-fidelity spectral reconstruction. Importantly, the framework is intrinsically fully differentiable, a feature that permits the joint optimization of both the metalens structure and the parameters governing the neural network. The experimental results presented herein validate the exceptional spatial-spectral reconstruction performance, underscoring the efficacy of this system in practical, real-world scenarios. This innovative approach transcends the traditional boundaries separating hardware and software in the realm of computational imaging and holds the promise of substantially propelling the miniaturization of spectral imaging systems. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2311.15164v1-abstract-full').style.display = 'none'; document.getElementById('2311.15164v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 November, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2023. 
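<p class="is-size-7"> The fully differentiable optics-plus-network co-design described in this entry can be pictured with a minimal sketch: a learnable encoding matrix stands in for the spectral response of the two polarization channels of the metalens, and a small network reconstructs the spectrum from the two measurements, with both updated by the same gradient step. The toy model, layer sizes, and synthetic training data below are illustrative assumptions, not the paper's actual architecture. </p> <pre>
# Minimal sketch of end-to-end co-design (hypothetical toy model): a learnable
# 2 x n_bands matrix plays the role of the polarization-multiplexed encoder,
# and an MLP decoder reconstructs the spectrum; both receive the same gradient.
import torch
import torch.nn as nn

n_bands, n_meas = 31, 2
optics = nn.Parameter(0.1 * torch.randn(n_meas, n_bands))   # stand-in "metalens" response
decoder = nn.Sequential(nn.Linear(n_meas, 64), nn.ReLU(), nn.Linear(64, n_bands))
opt = torch.optim.Adam([optics, *decoder.parameters()], lr=1e-3)

for step in range(1000):
    spectra = torch.rand(128, n_bands)            # synthetic training spectra
    meas = spectra @ optics.t()                   # differentiable "capture"
    loss = nn.functional.mse_loss(decoder(meas), spectra)
    opt.zero_grad(); loss.backward(); opt.step()  # optics and decoder update jointly
</pre>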
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2311.13371">arXiv:2311.13371</a> <span> [<a href="https://arxiv.org/pdf/2311.13371">pdf</a>, <a href="https://arxiv.org/format/2311.13371">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> A Novel Dynamic Event-triggered Mechanism for Dynamic Average Consensus </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Xu%2C+T">Tao Xu</a>, <a href="/search/eess?searchtype=author&query=Duan%2C+Z">Zhisheng Duan</a>, <a href="/search/eess?searchtype=author&query=Wen%2C+G">Guanghui Wen</a>, <a href="/search/eess?searchtype=author&query=Sun%2C+Z">Zhiyong Sun</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2311.13371v1-abstract-short" style="display: inline;"> This paper studies a challenging issue introduced in a recent survey, namely designing a distributed event-based scheme to solve the dynamic average consensus (DAC) problem. First, a robust adaptive distributed event-based DAC algorithm is designed without imposing specific initialization criteria to perform estimation task under intermittent communication. Second, a novel adaptive distributed dyn… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2311.13371v1-abstract-full').style.display = 'inline'; document.getElementById('2311.13371v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2311.13371v1-abstract-full" style="display: none;"> This paper studies a challenging issue introduced in a recent survey, namely designing a distributed event-based scheme to solve the dynamic average consensus (DAC) problem. First, a robust adaptive distributed event-based DAC algorithm is designed without imposing specific initialization criteria to perform estimation task under intermittent communication. Second, a novel adaptive distributed dynamic event-triggered mechanism is proposed to determine the triggering time when neighboring agents broadcast information to each other. Compared to the existing event-triggered mechanisms, the novelty of the proposed dynamic event-triggered mechanism lies in that it guarantees the existence of a positive and uniform minimum inter-event interval without sacrificing any accuracy of the estimation, which is much more practical than only ensuring the exclusion of the Zeno behavior or the boundedness of the estimation error. Third, a composite adaptive law is developed to update the adaptive gain employed in the distributed event-based DAC algorithm and dynamic event-triggered mechanism. Using the composite adaptive update law, the distributed event-based solution proposed in our work is implemented without requiring any global information. Finally, numerical simulations are provided to illustrate the effectiveness of the theoretical results. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2311.13371v1-abstract-full').style.display = 'none'; document.getElementById('2311.13371v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 November, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">9 pages, 8 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2310.20289">arXiv:2310.20289</a> <span> [<a href="https://arxiv.org/pdf/2310.20289">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Optics">physics.optics</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Applied Physics">physics.app-ph</span> </div> </div> <p class="title is-5 mathjax"> C-Silicon-based metasurfaces for aperture-robust spectrometer/imaging with angle integration </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Xu%2C+W">Weizhu Xu</a>, <a href="/search/eess?searchtype=author&query=Fan%2C+Q">Qingbin Fan</a>, <a href="/search/eess?searchtype=author&query=Lin%2C+P">Peicheng Lin</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+J">Jiarong Wang</a>, <a href="/search/eess?searchtype=author&query=Hu%2C+H">Hao Hu</a>, <a href="/search/eess?searchtype=author&query=Yue%2C+T">Tao Yue</a>, <a href="/search/eess?searchtype=author&query=Hu%2C+X">Xuemei Hu</a>, <a href="/search/eess?searchtype=author&query=Xu%2C+T">Ting Xu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2310.20289v1-abstract-short" style="display: inline;"> Compared with conventional grating-based spectrometers, reconstructive spectrometers based on spectrally engineered filtering have the advantage of miniaturization because of the less demand for dispersive optics and free propagation space. However, available reconstructive spectrometers fail to balance the performance on operational bandwidth, spectral diversity and angular stability. In this wor… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.20289v1-abstract-full').style.display = 'inline'; document.getElementById('2310.20289v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2310.20289v1-abstract-full" style="display: none;"> Compared with conventional grating-based spectrometers, reconstructive spectrometers based on spectrally engineered filtering have the advantage of miniaturization because of the less demand for dispersive optics and free propagation space. However, available reconstructive spectrometers fail to balance the performance on operational bandwidth, spectral diversity and angular stability. In this work, we proposed a compact silicon metasurfaces based spectrometer/camera. 
After angle integration, the spectral response of the system is robust to angle/aperture within a wide working bandwidth from 400nm to 800nm. It is experimentally demonstrated that the proposed method could maintain the spectral consistency from F/1.8 to F/4 (the corresponding angle of incident light ranges from 7° to 16°) and the incident hyperspectral signal could be accurately reconstructed with a fidelity exceeding 99%. Additionally, a spectral imaging system with 400x400 pixels is also established in this work. The accurately reconstructed hyperspectral image indicates that the proposed aperture-robust spectrometer has the potential to be extended as a high-resolution broadband hyperspectral camera. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.20289v1-abstract-full').style.display = 'none'; document.getElementById('2310.20289v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 October, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2310.08251">arXiv:2310.08251</a> <span> [<a href="https://arxiv.org/pdf/2310.08251">pdf</a>, <a href="https://arxiv.org/format/2310.08251">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.3390/jmse12122356">10.3390/jmse12122356 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Underwater Sound Speed Profile Construction: A Review </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Huang%2C+W">Wei Huang</a>, <a href="/search/eess?searchtype=author&query=Zhou%2C+J">Jixuan Zhou</a>, <a href="/search/eess?searchtype=author&query=Gao%2C+F">Fan Gao</a>, <a href="/search/eess?searchtype=author&query=Lu%2C+J">Jiajun Lu</a>, <a href="/search/eess?searchtype=author&query=Li%2C+S">Sijia Li</a>, <a href="/search/eess?searchtype=author&query=Wu%2C+P">Pengfei Wu</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+J">Junting Wang</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+H">Hao Zhang</a>, <a href="/search/eess?searchtype=author&query=Xu%2C+T">Tianhe Xu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2310.08251v1-abstract-short" style="display: inline;"> Real-time and accurate construction of regional sound speed profiles (SSP) is important for building underwater positioning, navigation, and timing (PNT) systems as it greatly affects the signal propagation modes such as trajectory.
In this paper, we summarize and analyze the current research status in the field of underwater SSP construction, and the mainstream methods include direct SSP measur… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.08251v1-abstract-full').style.display = 'inline'; document.getElementById('2310.08251v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2310.08251v1-abstract-full" style="display: none;"> Real-time and accurate construction of regional sound speed profiles (SSP) is important for building underwater positioning, navigation, and timing (PNT) systems as it greatly affects the signal propagation modes such as trajectory. In this paper, we summarize and analyze the current research status in the field of underwater SSP construction, and the mainstream methods include direct SSP measurement and SSP inversion. In the direct measurement method, we compare the performance of popular international commercial temperature, conductivity, and depth profilers (CTD). For the inversion methods, the framework and basic principles of matched field processing (MFP), compressive sensing (CS), and deep learning (DL) for constructing SSP are introduced, and their advantages and disadvantages are compared. The traditional direct measurement method has good accuracy performance, but it usually takes a long time. The proposal of SSP inversion methods greatly improves the convenience and real-time performance, but the accuracy is not as good as the direct measurement method. Currently, SSP inversion relies on sonar observation data, making it difficult to apply to areas that cannot be covered by underwater observation systems, and these methods are unable to predict the distribution of sound velocity at future times. How to comprehensively utilize multi-source data and provide elastic sound velocity distribution estimation services with different accuracy and real-time requirements for underwater users without sonar observation data is the mainstream trend in future research on SSP construction. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.08251v1-abstract-full').style.display = 'none'; document.getElementById('2310.08251v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 October, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2023.
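<p class="is-size-7"> As a concrete illustration of the direct-measurement route mentioned in this entry, a sound speed profile is typically computed from CTD casts (temperature, salinity, depth) through an empirical formula; the sketch below uses the commonly cited Mackenzie (1981) nine-term equation, which is a standard choice rather than necessarily the one used by the profilers compared in the paper. </p> <pre>
# Sketch: turn a CTD cast into a sound speed profile with the Mackenzie (1981)
# empirical equation (T in degC, S in PSU, D in m; result in m/s).
def mackenzie_sound_speed(T, S, D):
    return (1448.96 + 4.591*T - 5.304e-2*T**2 + 2.374e-4*T**3
            + 1.340*(S - 35.0) + 1.630e-2*D + 1.675e-7*D**2
            - 1.025e-2*T*(S - 35.0) - 7.139e-13*T*D**3)

# Toy cast: (depth m, temperature degC, salinity PSU) -> list of (depth, speed)
cast = [(0, 20.0, 35.1), (50, 15.2, 35.0), (200, 10.1, 34.9), (1000, 4.5, 34.7)]
ssp = [(d, mackenzie_sound_speed(t, s, d)) for d, t, s in cast]
</pre>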
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> Journal of Marine Science and Engineering 2024 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2310.08201">arXiv:2310.08201</a> <span> [<a href="https://arxiv.org/pdf/2310.08201">pdf</a>, <a href="https://arxiv.org/format/2310.08201">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Fast Ray-Tracing-Based Precise Underwater Acoustic Localization without Prior Acknowledgment of Target Depth </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Huang%2C+W">Wei Huang</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+H">Hao Zhang</a>, <a href="/search/eess?searchtype=author&query=Meng%2C+K">Kaitao Meng</a>, <a href="/search/eess?searchtype=author&query=Gao%2C+F">Fan Gao</a>, <a href="/search/eess?searchtype=author&query=Sun%2C+W">Wenzhou Sun</a>, <a href="/search/eess?searchtype=author&query=Shu%2C+J">Jianxu Shu</a>, <a href="/search/eess?searchtype=author&query=Xu%2C+T">Tianhe Xu</a>, <a href="/search/eess?searchtype=author&query=Li%2C+D">Deshi Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2310.08201v1-abstract-short" style="display: inline;"> Underwater localization is of great importance for marine observation and building positioning, navigation, and timing (PNT) systems that could be widely applied in disaster warning, underwater rescues and resource exploration. The uneven distribution of underwater sound velocity poses a great challenge for precise underwater positioning. The current soundline correction positioning method mainly aims… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.08201v1-abstract-full').style.display = 'inline'; document.getElementById('2310.08201v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2310.08201v1-abstract-full" style="display: none;"> Underwater localization is of great importance for marine observation and building positioning, navigation, and timing (PNT) systems that could be widely applied in disaster warning, underwater rescues and resource exploration. The uneven distribution of underwater sound velocity poses a great challenge for precise underwater positioning. The current soundline correction positioning method mainly aims at scenarios with known target depth. However, for nodes that are non-cooperative or lack depth information, soundline tracking strategies cannot work well due to nonunique positional solutions. To tackle this issue, we propose an iterative ray tracing 3D underwater localization (IRTUL) method for stratification compensation. To demonstrate the feasibility of fast stratification compensation, we first derive the signal path as a function of glancing angle, and then prove that the signal propagation time and horizontal propagation distance are monotonic functions of the initial grazing angle, so that fast ray tracing can be achieved. Then, we propose a sound velocity profile (SVP) simplification method, which reduces the computational cost of ray tracing.
Experimental results show that the IRTUL has the most significant distance correction in the depth direction, and the average accuracy of IRTUL has been improved by about 3 meters compared to localization model with constant sound velocity. Also, the simplified SVP can significantly improve real-time performance with average accuracy loss less than 0.2 m when used for positioning. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.08201v1-abstract-full').style.display = 'none'; document.getElementById('2310.08201v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 October, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2310.04657">arXiv:2310.04657</a> <span> [<a href="https://arxiv.org/pdf/2310.04657">pdf</a>, <a href="https://arxiv.org/format/2310.04657">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> Spike-Triggered Contextual Biasing for End-to-End Mandarin Speech Recognition </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Huang%2C+K">Kaixun Huang</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+A">Ao Zhang</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+B">Binbin Zhang</a>, <a href="/search/eess?searchtype=author&query=Xu%2C+T">Tianyi Xu</a>, <a href="/search/eess?searchtype=author&query=Song%2C+X">Xingchen Song</a>, <a href="/search/eess?searchtype=author&query=Xie%2C+L">Lei Xie</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2310.04657v1-abstract-short" style="display: inline;"> The attention-based deep contextual biasing method has been demonstrated to effectively improve the recognition performance of end-to-end automatic speech recognition (ASR) systems on given contextual phrases. However, unlike shallow fusion methods that directly bias the posterior of the ASR model, deep biasing methods implicitly integrate contextual information, making it challenging to control t… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.04657v1-abstract-full').style.display = 'inline'; document.getElementById('2310.04657v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2310.04657v1-abstract-full" style="display: none;"> The attention-based deep contextual biasing method has been demonstrated to effectively improve the recognition performance of end-to-end automatic speech recognition (ASR) systems on given contextual phrases. However, unlike shallow fusion methods that directly bias the posterior of the ASR model, deep biasing methods implicitly integrate contextual information, making it challenging to control the degree of bias. In this study, we introduce a spike-triggered deep biasing method that simultaneously supports both explicit and implicit bias. 
Moreover, both bias approaches exhibit significant improvements and can be cascaded with shallow fusion methods for better results. Furthermore, we propose a context sampling enhancement strategy and improve the contextual phrase filtering algorithm. Experiments on the public WenetSpeech Mandarin biased-word dataset show a 32.0% relative CER reduction compared to the baseline model, with an impressive 68.6% relative CER reduction on contextual phrases. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.04657v1-abstract-full').style.display = 'none'; document.getElementById('2310.04657v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 October, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by ASRU2023</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2309.02855">arXiv:2309.02855</a> <span> [<a href="https://arxiv.org/pdf/2309.02855">pdf</a>, <a href="https://arxiv.org/format/2309.02855">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> Bandwidth-efficient Inference for Neural Image Compression </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Yin%2C+S">Shanzhi Yin</a>, <a href="/search/eess?searchtype=author&query=Xu%2C+T">Tongda Xu</a>, <a href="/search/eess?searchtype=author&query=Liang%2C+Y">Yongsheng Liang</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+Y">Yuanyuan Wang</a>, <a href="/search/eess?searchtype=author&query=Li%2C+Y">Yanghao Li</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+Y">Yan Wang</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+J">Jingjing Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2309.02855v2-abstract-short" style="display: inline;"> With neural networks growing deeper and feature maps growing larger, limited communication bandwidth with external memory (or DRAM) and power constraints become a bottleneck in implementing network inference on mobile and edge devices.
In this paper, we propose an end-to-end differentiable bandwidth efficient neural inference method with the activation compressed by neural data compression method.… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2309.02855v2-abstract-full').style.display = 'inline'; document.getElementById('2309.02855v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2309.02855v2-abstract-full" style="display: none;"> With neural networks growing deeper and feature maps growing larger, limited communication bandwidth with external memory (or DRAM) and power constraints become a bottleneck in implementing network inference on mobile and edge devices. In this paper, we propose an end-to-end differentiable bandwidth efficient neural inference method with the activation compressed by neural data compression method. Specifically, we propose a transform-quantization-entropy coding pipeline for activation compression with symmetric exponential Golomb coding and a data-dependent Gaussian entropy model for arithmetic coding. Optimized with existing model quantization methods, low-level task of image compression can achieve up to 19x bandwidth reduction with 6.21x energy saving. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2309.02855v2-abstract-full').style.display = 'none'; document.getElementById('2309.02855v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 September, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 6 September, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2023. 
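<p class="is-size-7"> The entropy-coding stage named in this entry (symmetric exponential Golomb coding of quantized activations) can be sketched in a few lines; what follows is the textbook order-0 exponential-Golomb code with a zigzag mapping for signed values, and the paper's exact variant may differ. </p> <pre>
# Sketch of order-0 exponential-Golomb coding for quantized activation residuals.
def exp_golomb_encode(n):
    """Encode a non-negative integer as an order-0 exp-Golomb bit string."""
    bits = bin(n + 1)[2:]                    # binary form of n+1
    return "0" * (len(bits) - 1) + bits      # leading zeros, then the value

def zigzag(x):
    """Map signed 0, 1, -1, 2, -2, ... to 0, 1, 2, 3, 4, ... so they can be coded."""
    return 2 * x - 1 if x > 0 else -2 * x

residuals = [0, 1, -1, 3, -2]                # toy quantized residuals
bitstream = "".join(exp_golomb_encode(zigzag(x)) for x in residuals)
# per-symbol codes: "1", "010", "011", "00110", "00101"
</pre>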
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">9 pages, 6 figures, submitted to ICASSP 2024</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">MSC Class:</span> 68U10 (primary); 94A08, 68T07 (secondary) <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> I.2.6; I.4.2 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2308.13287">arXiv:2308.13287</a> <span> [<a href="https://arxiv.org/pdf/2308.13287">pdf</a>, <a href="https://arxiv.org/format/2308.13287">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> Efficient Learned Lossless JPEG Recompression </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Guo%2C+L">Lina Guo</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+Y">Yuanyuan Wang</a>, <a href="/search/eess?searchtype=author&query=Xu%2C+T">Tongda Xu</a>, <a href="/search/eess?searchtype=author&query=Luo%2C+J">Jixiang Luo</a>, <a href="/search/eess?searchtype=author&query=He%2C+D">Dailan He</a>, <a href="/search/eess?searchtype=author&query=Ji%2C+Z">Zhenjun Ji</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+S">Shanshan Wang</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+Y">Yang Wang</a>, <a href="/search/eess?searchtype=author&query=Qin%2C+H">Hongwei Qin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2308.13287v1-abstract-short" style="display: inline;"> JPEG is one of the most popular image compression methods. It is beneficial to compress existing JPEG files without introducing additional distortion. In this paper, we propose a deep learning based method to further compress JPEG images losslessly. Specifically, we propose a Multi-Level Parallel Conditional Modeling (ML-PCM) architecture, which enables parallel decoding in different granula… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2308.13287v1-abstract-full').style.display = 'inline'; document.getElementById('2308.13287v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2308.13287v1-abstract-full" style="display: none;"> JPEG is one of the most popular image compression methods. It is beneficial to compress existing JPEG files without introducing additional distortion. In this paper, we propose a deep learning based method to further compress JPEG images losslessly. Specifically, we propose a Multi-Level Parallel Conditional Modeling (ML-PCM) architecture, which enables parallel decoding in different granularities. First, luma and chroma are processed independently to allow parallel coding. Second, we propose pipeline parallel context model (PPCM) and compressed checkerboard context model (CCCM) for effective conditional modeling and efficient decoding within luma and chroma components. Our method has much lower latency while achieving a better compression ratio than the previous SOTA.
After proper software optimization, we can obtain a good throughput of 57 FPS for 1080P images on NVIDIA T4 GPU. Furthermore, combined with quantization, our approach can also act as a lossy JPEG codec which has obvious advantage over SOTA lossy compression methods in high bit rate (bpp$>0.9$). <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2308.13287v1-abstract-full').style.display = 'none'; document.getElementById('2308.13287v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 August, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2308.08154">arXiv:2308.08154</a> <span> [<a href="https://arxiv.org/pdf/2308.08154">pdf</a>, <a href="https://arxiv.org/format/2308.08154">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Conditional Perceptual Quality Preserving Image Compression </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Xu%2C+T">Tongda Xu</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+Q">Qian Zhang</a>, <a href="/search/eess?searchtype=author&query=Li%2C+Y">Yanghao Li</a>, <a href="/search/eess?searchtype=author&query=He%2C+D">Dailan He</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+Z">Zhe Wang</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+Y">Yuanyuan Wang</a>, <a href="/search/eess?searchtype=author&query=Qin%2C+H">Hongwei Qin</a>, <a href="/search/eess?searchtype=author&query=Wang%2C+Y">Yan Wang</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+J">Jingjing Liu</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+Y">Ya-Qin Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2308.08154v1-abstract-short" style="display: inline;"> We propose conditional perceptual quality, an extension of the perceptual quality defined in \citet{blau2018perception}, by conditioning it on user defined information. Specifically, we extend the original perceptual quality $d(p_{X},p_{\hat{X}})$ to the conditional perceptual quality $d(p_{X|Y},p_{\hat{X}|Y})$, where $X$ is the original image, $\hat{X}$ is the reconstructed, $Y$ is side informati… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2308.08154v1-abstract-full').style.display = 'inline'; document.getElementById('2308.08154v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2308.08154v1-abstract-full" style="display: none;"> We propose conditional perceptual quality, an extension of the perceptual quality defined in \citet{blau2018perception}, by conditioning it on user defined information. 
Specifically, we extend the original perceptual quality $d(p_{X},p_{\hat{X}})$ to the conditional perceptual quality $d(p_{X|Y},p_{\hat{X}|Y})$, where $X$ is the original image, $\hat{X}$ is the reconstructed image, $Y$ is side information defined by the user, and $d(\cdot,\cdot)$ is a divergence. We show that conditional perceptual quality has theoretical properties similar to the rate-distortion-perception trade-off \citep{blau2019rethinking}. Based on these theoretical results, we propose an optimal framework for conditional perceptual quality preserving compression. Experimental results show that our codec successfully maintains high perceptual quality and semantic quality at all bitrates. Besides, by providing a lower bound on the common randomness required, we settle the previous arguments on whether randomness should be incorporated into the generator for (conditional) perceptual quality compression. The source code is provided in the supplementary material. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2308.08154v1-abstract-full').style.display = 'none'; document.getElementById('2308.08154v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 August, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2307.03998">arXiv:2307.03998</a> <span> [<a href="https://arxiv.org/pdf/2307.03998">pdf</a>, <a href="https://arxiv.org/format/2307.03998">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> Lightweight Improved Residual Network for Efficient Inverse Tone Mapping </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Xue%2C+L">Liqi Xue</a>, <a href="/search/eess?searchtype=author&query=Xu%2C+T">Tianyi Xu</a>, <a href="/search/eess?searchtype=author&query=Song%2C+Y">Yongbao Song</a>, <a href="/search/eess?searchtype=author&query=Liu%2C+Y">Yan Liu</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+L">Lei Zhang</a>, <a href="/search/eess?searchtype=author&query=Zhen%2C+X">Xiantong Zhen</a>, <a href="/search/eess?searchtype=author&query=Xu%2C+J">Jun Xu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2307.03998v2-abstract-short" style="display: inline;"> The display devices like HDR10 televisions are increasingly prevalent in our daily life for visualizing high dynamic range (HDR) images. But the majority of media images on the internet remain in 8-bit standard dynamic range (SDR) format. Therefore, converting SDR images to HDR ones by inverse tone mapping (ITM) is crucial to unlock the full potential of abundant media images.
However, existing IT… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2307.03998v2-abstract-full').style.display = 'inline'; document.getElementById('2307.03998v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2307.03998v2-abstract-full" style="display: none;"> The display devices like HDR10 televisions are increasingly prevalent in our daily life for visualizing high dynamic range (HDR) images. But the majority of media images on the internet remain in 8-bit standard dynamic range (SDR) format. Therefore, converting SDR images to HDR ones by inverse tone mapping (ITM) is crucial to unlock the full potential of abundant media images. However, existing ITM methods are usually developed with complex network architectures requiring huge computational costs. In this paper, we propose a lightweight Improved Residual Network (IRNet) by enhancing the power of popular residual block for efficient ITM. Specifically, we propose a new Improved Residual Block (IRB) to extract and fuse multi-layer features for fine-grained HDR image reconstruction. Experiments on three benchmark datasets demonstrate that our IRNet achieves state-of-the-art performance on both the ITM and joint SR-ITM tasks. The code, models and data will be publicly available at https://github.com/ThisisVikki/ITM-baseline. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2307.03998v2-abstract-full').style.display = 'none'; document.getElementById('2307.03998v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 December, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 8 July, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2307.03921">arXiv:2307.03921</a> <span> [<a href="https://arxiv.org/pdf/2307.03921">pdf</a>, <a href="https://arxiv.org/format/2307.03921">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Social-Mobility-Aware Joint Communication and Computation Resource Management in NOMA-Enabled Vehicular Networks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Xue%2C+T">Tong Xue</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+H">Haixia Zhang</a>, <a href="/search/eess?searchtype=author&query=Ding%2C+H">Hui Ding</a>, <a href="/search/eess?searchtype=author&query=Yuan%2C+D">Dongfeng Yuan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2307.03921v1-abstract-short" style="display: inline;"> The existing computation and communication (2C) optimization schemes for vehicular edge computing (VEC) networks mainly focus on the physical domain without considering the influence from the social domain. This may greatly limit the potential of task offloading, making it difficult to fully boom the task offloading rate with given power, resulting in low energy efficiency (EE). 
To address the iss… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2307.03921v1-abstract-full').style.display = 'inline'; document.getElementById('2307.03921v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2307.03921v1-abstract-full" style="display: none;"> The existing computation and communication (2C) optimization schemes for vehicular edge computing (VEC) networks mainly focus on the physical domain without considering the influence from the social domain. This may greatly limit the potential of task offloading, making it difficult to fully boom the task offloading rate with given power, resulting in low energy efficiency (EE). To address the issue, this letter devotes itself to investigate social-mobility-aware VEC framework and proposes a novel EE-oriented 2C assignment scheme. In doing so, we assume that the task vehicular user (T-VU) can offload computation tasks to the service vehicular user (S-VU) and the road side unit (RSU) by non-orthogonal multiple access (NOMA). An optimization problem is formulated to jointly assign the 2C resources to maximize the system EE, which turns out to be a mixed integer non-convex objective function. To solve the problem, we transform it into separated computation and communication resource allocation subproblems. Dealing with the first subproblem, we propose a social-mobility-aware edge server selection and task splitting algorithm (SM-SSTSA) to achieve edge server selection and task splitting. Then, by solving the second subproblem, the power allocation and spectrum assignment solutions are obtained utilizing a tightening lower bound method and a Kuhn-Munkres algorithm. Finally, we solve the original problem through an iterative method. Simulation results demonstrate the superior EE performance of the proposed scheme. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2307.03921v1-abstract-full').style.display = 'none'; document.getElementById('2307.03921v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 July, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2023. 
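<p class="is-size-7"> For the spectrum-assignment step mentioned in this entry, the Kuhn-Munkres (Hungarian) algorithm finds a one-to-one matching that maximizes total utility once a per-(offloading link, subchannel) utility matrix is available; the sketch below uses random numbers as stand-ins for the energy-efficiency values the letter actually derives. </p> <pre>
# Sketch of Kuhn-Munkres spectrum assignment via SciPy's Hungarian solver.
import numpy as np
from scipy.optimize import linear_sum_assignment

n_links, n_channels = 4, 6
utility = np.random.rand(n_links, n_channels)          # hypothetical EE utilities
rows, cols = linear_sum_assignment(utility, maximize=True)
assignment = dict(zip(rows.tolist(), cols.tolist()))   # link -> assigned subchannel
total_utility = utility[rows, cols].sum()
</pre>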
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2306.00804">arXiv:2306.00804</a> <span> [<a href="https://arxiv.org/pdf/2306.00804">pdf</a>, <a href="https://arxiv.org/format/2306.00804">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Adaptive Contextual Biasing for Transducer Based Streaming Speech Recognition </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/eess?searchtype=author&query=Xu%2C+T">Tianyi Xu</a>, <a href="/search/eess?searchtype=author&query=Yang%2C+Z">Zhanheng Yang</a>, <a href="/search/eess?searchtype=author&query=Huang%2C+K">Kaixun Huang</a>, <a href="/search/eess?searchtype=author&query=Guo%2C+P">Pengcheng Guo</a>, <a href="/search/eess?searchtype=author&query=Zhang%2C+A">Ao Zhang</a>, <a href="/search/eess?searchtype=author&query=Li%2C+B">Biao Li</a>, <a href="/search/eess?searchtype=author&query=Chen%2C+C">Changru Chen</a>, <a href="/search/eess?searchtype=author&query=Li%2C+C">Chao Li</a>, <a href="/search/eess?searchtype=author&query=Xie%2C+L">Lei Xie</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2306.00804v3-abstract-short" style="display: inline;"> By incorporating additional contextual information, deep biasing methods have emerged as a promising solution for speech recognition of personalized words. However, for real-world voice assistants, always biasing on such personalized words with high prediction scores can significantly degrade the performance of recognizing common words. To address this issue, we propose an adaptive contextual bias… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2306.00804v3-abstract-full').style.display = 'inline'; document.getElementById('2306.00804v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2306.00804v3-abstract-full" style="display: none;"> By incorporating additional contextual information, deep biasing methods have emerged as a promising solution for speech recognition of personalized words. However, for real-world voice assistants, always biasing on such personalized words with high prediction scores can significantly degrade the performance of recognizing common words. To address this issue, we propose an adaptive contextual biasing method based on Context-Aware Transformer Transducer (CATT) that utilizes the biased encoder and predictor embeddings to perform streaming prediction of contextual phrase occurrences. Such prediction is then used to dynamically switch the bias list on and off, enabling the model to adapt to both personalized and common scenarios. Experiments on Librispeech and internal voice assistant datasets show that our approach can achieve up to 6.7% and 20.7% relative reduction in WER and CER compared to the baseline respectively, mitigating up to 96.7% and 84.9% of the relative WER and CER increase for common cases. 
Furthermore, our approach has a minimal performance impact in personalized scenarios while maintaining a streaming inference pipeline with negligible RTF increase. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2306.00804v3-abstract-full').style.display = 'none'; document.getElementById('2306.00804v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 August, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 1 June, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2023. </p> </li> </ol> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Xue%2C+T&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Xue%2C+T&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Xue%2C+T&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Xue%2C+T&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> </ul> </nav> <div class="is-hidden-tablet"> <!-- feedback for mobile only --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a> </span> </div> </div> </main> <footer> <div class="columns is-desktop" role="navigation" aria-label="Secondary"> <!-- MetaColumn 1 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div> <!-- end MetaColumn 1 --> <!-- MetaColumn 2 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a 
href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div> <!-- end MetaColumn 2 --> </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>