Search | arXiv e-print repository

<!DOCTYPE html> <html lang="en"> <head> <meta charset="utf-8"/> <meta name="viewport" content="width=device-width, initial-scale=1"/>  <link rel="apple-touch-icon" sizes="180x180" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/apple-touch-icon.png"> <link rel="icon" type="image/png" sizes="32x32" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-32x32.png"> <link rel="icon" type="image/png" sizes="16x16" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-16x16.png"> <link rel="manifest" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/site.webmanifest"> <link rel="mask-icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/safari-pinned-tab.svg" color="#b31b1b"> <link rel="shortcut icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon.ico"> <meta name="msapplication-TileColor" content="#b31b1b"> <meta name="msapplication-config" content="images/icons/browserconfig.xml"> <meta name="theme-color" content="#b31b1b">  <title>Search | arXiv e-print repository</title> <script defer src="https://static.arxiv.org/static/base/1.0.0a5/fontawesome-free-5.11.2-web/js/all.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/base/1.0.0a5/css/arxivstyle.css" /> <script type="text/x-mathjax-config"> MathJax.Hub.Config({ messageStyle: "none", extensions: ["tex2jax.js"], jax: ["input/TeX", "output/HTML-CSS"], tex2jax: { inlineMath: [ ['$','$'], ["\$","\$"] ], displayMath: [ ['$$','$$'], ["\\[","\\]"] ], processEscapes: true, ignoreClass: '.*', processClass: 'mathjax.*' }, TeX: { extensions: ["AMSmath.js", "AMSsymbols.js", "noErrors.js"], noErrors: { inlineDelimiters: ["$","$"], multiLine: false, style: { "font-size": "normal", "border": "" } } }, "HTML-CSS": { availableFonts: ["TeX"] } }); </script> <script src='//static.arxiv.org/MathJax-2.7.3/MathJax.js'></script> <script 
src="https://static.arxiv.org/static/base/1.0.0a5/js/notification.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/bulma-tooltip.min.css" /> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/search.css" /> <script src="https://code.jquery.com/jquery-3.2.1.slim.min.js" integrity="sha256-k2WSCIexGzOj3Euiig+TlR8gA0EmPjuc79OEeY5L45g=" crossorigin="anonymous"></script> <script src="https://static.arxiv.org/static/search/0.5.6/js/fieldset.js"></script> <style> radio#cf-customfield_11400 { display: none; } </style> </head> <body> <header><a href="#main-container" class="is-sr-only">Skip to main content</a>  <div class="attribution level is-marginless" role="banner"> <div class="level-left"> <a class="level-item" href="https://cornell.edu/"><img src="https://static.arxiv.org/static/base/1.0.0a5/images/cornell-reduced-white-SMALL.svg" alt="Cornell University" width="200" aria-label="logo" /></a> </div> <div class="level-right is-marginless"><p class="sponsors level-item is-marginless"><span id="support-ack-url">We gratefully acknowledge support from<br /> the Simons Foundation, <a href="https://info.arxiv.org/about/ourmembers.html">member institutions</a>, and all contributors. <a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div>  <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." 
aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div>  <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1–50 of 491 results for author: <span class="mathjax">Du, H</span> </h1> </div> <div class="level-right is-hidden-mobile">  <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>  </span> </div> </div> <div class="content"> <form method="GET" action="/search/" role="search"> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or 
terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." type="text" value="Du, H"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Du%2C+H&terms-0-field=author&size=50&order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option 
value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Du, H"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. 
</div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Du%2C+H&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Du%2C+H&start=0" class="pagination-link is-current" aria-label="Page 1" aria-current="page">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Du%2C+H&start=50" class="pagination-link " aria-label="Goto page 2">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Du%2C+H&start=100" class="pagination-link " aria-label="Goto page 3">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Du%2C+H&start=150" class="pagination-link " aria-label="Goto page 4">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Du%2C+H&start=200" class="pagination-link " aria-label="Goto page 5">5 </a> </li> <li><span class="pagination-ellipsis">…</span></li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.18010">arXiv:2411.18010</a> <span> [<a href="https://arxiv.org/pdf/2411.18010">pdf</a>, <a 
href="https://arxiv.org/format/2411.18010">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> JPPO: Joint Power and Prompt Optimization for Accelerated Large Language Model Services </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/?searchtype=author&query=You%2C+F">Feiran You</a>, <a href="/search/?searchtype=author&query=Du%2C+H">Hongyang Du</a>, <a href="/search/?searchtype=author&query=Huang%2C+K">Kaibin Huang</a>, <a href="/search/?searchtype=author&query=Jamalipour%2C+A">Abbas Jamalipour</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.18010v1-abstract-short" style="display: inline;"> Large Language Models (LLMs) have demonstrated remarkable capabilities in various tasks, leading to their increasing deployment in wireless networks for a wide variety of user services. However, the growing longer prompt setting highlights the crucial issue of computational resource demands and huge communication load. 
To address this challenge, we propose Joint Power and Prompt Optimization (JPPO… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.18010v1-abstract-full').style.display = 'inline'; document.getElementById('2411.18010v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.18010v1-abstract-full" style="display: none;"> Large Language Models (LLMs) have demonstrated remarkable capabilities in various tasks, leading to their increasing deployment in wireless networks for a wide variety of user services. However, the growing longer prompt setting highlights the crucial issue of computational resource demands and huge communication load. To address this challenge, we propose Joint Power and Prompt Optimization (JPPO), a framework that combines Small Language Model (SLM)-based prompt compression with wireless power allocation optimization. By deploying SLM at user devices for prompt compression and employing Deep Reinforcement Learning for joint optimization of compression ratio and transmission power, JPPO effectively balances service quality with resource efficiency. Experimental results demonstrate that our framework achieves high service fidelity and low bit error rates while optimizing power usage in wireless LLM services. The system reduces response time by about 17%, with the improvement varying based on the length of the original prompt. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.18010v1-abstract-full').style.display = 'none'; document.getElementById('2411.18010v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.16729">arXiv:2411.16729</a> <span> [<a href="https://arxiv.org/pdf/2411.16729">pdf</a>, <a href="https://arxiv.org/format/2411.16729">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> DiM-Gestor: Co-Speech Gesture Generation with Adaptive Layer Normalization Mamba-2 </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/?searchtype=author&query=Zhang%2C+F">Fan Zhang</a>, <a href="/search/?searchtype=author&query=Zhao%2C+S">Siyuan Zhao</a>, <a href="/search/?searchtype=author&query=Ji%2C+N">Naye Ji</a>, <a href="/search/?searchtype=author&query=Wang%2C+Z">Zhaohan Wang</a>, <a href="/search/?searchtype=author&query=Wu%2C+J">Jingmei Wu</a>, <a href="/search/?searchtype=author&query=Gao%2C+F">Fuxing Gao</a>, <a href="/search/?searchtype=author&query=Ye%2C+Z">Zhenqing Ye</a>, <a href="/search/?searchtype=author&query=Yan%2C+L">Leyao Yan</a>, <a href="/search/?searchtype=author&query=Dai%2C+L">Lanxin Dai</a>, <a href="/search/?searchtype=author&query=Geng%2C+W">Weidong Geng</a>, <a href="/search/?searchtype=author&query=Lyu%2C+X">Xin Lyu</a>, <a href="/search/?searchtype=author&query=Zhao%2C+B">Bozuo Zhao</a>, <a 
href="/search/?searchtype=author&query=Yu%2C+D">Dingguo Yu</a>, <a href="/search/?searchtype=author&query=Du%2C+H">Hui Du</a>, <a href="/search/?searchtype=author&query=Hu%2C+B">Bin Hu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.16729v1-abstract-short" style="display: inline;"> Speech-driven gesture generation using transformer-based generative models represents a rapidly advancing area within virtual human creation. However, existing models face significant challenges due to their quadratic time and space complexities, limiting scalability and efficiency. To address these limitations, we introduce DiM-Gestor, an innovative end-to-end generative model leveraging the Mamb… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.16729v1-abstract-full').style.display = 'inline'; document.getElementById('2411.16729v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.16729v1-abstract-full" style="display: none;"> Speech-driven gesture generation using transformer-based generative models represents a rapidly advancing area within virtual human creation. However, existing models face significant challenges due to their quadratic time and space complexities, limiting scalability and efficiency. To address these limitations, we introduce DiM-Gestor, an innovative end-to-end generative model leveraging the Mamba-2 architecture. DiM-Gestor features a dual-component framework: (1) a fuzzy feature extractor and (2) a speech-to-gesture mapping module, both built on the Mamba-2. The fuzzy feature extractor, integrated with a Chinese Pre-trained Model and Mamba-2, autonomously extracts implicit, continuous speech features. 
These features are synthesized into a unified latent representation and then processed by the speech-to-gesture mapping module. This module employs an Adaptive Layer Normalization (AdaLN)-enhanced Mamba-2 mechanism to uniformly apply transformations across all sequence tokens. This enables precise modeling of the nuanced interplay between speech features and gesture dynamics. We utilize a diffusion model to train and infer diverse gesture outputs. Extensive subjective and objective evaluations conducted on the newly released Chinese Co-Speech Gestures dataset corroborate the efficacy of our proposed model. Compared with Transformer-based architecture, the assessments reveal that our approach delivers competitive results and significantly reduces memory usage, approximately 2.4 times, and enhances inference speeds by 2 to 4 times. Additionally, we released the CCG dataset, a Chinese Co-Speech Gestures dataset, comprising 15.97 hours (six styles across five scenarios) of 3D full-body skeleton gesture motion performed by professional Chinese TV broadcasters. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.16729v1-abstract-full').style.display = 'none'; document.getElementById('2411.16729v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">13 pages, 11 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.12297">arXiv:2411.12297</a> <span> [<a href="https://arxiv.org/pdf/2411.12297">pdf</a>, <a href="https://arxiv.org/format/2411.12297">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Quantum Physics">quant-ph</span> </div> </div> <p class="title is-5 mathjax"> Quantum Indistinguishable Obfuscation via Quantum Circuit Equivalence </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/?searchtype=author&query=Zhang%2C+Y">Yuanjing Zhang</a>, <a href="/search/?searchtype=author&query=Shang%2C+T">Tao Shang</a>, <a href="/search/?searchtype=author&query=Zhang%2C+K">Kun Zhang</a>, <a href="/search/?searchtype=author&query=Zhang%2C+C">Chenyi Zhang</a>, <a href="/search/?searchtype=author&query=Du%2C+H">Haohua Du</a>, <a href="/search/?searchtype=author&query=Guo%2C+X">Xueyi Guo</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.12297v1-abstract-short" style="display: inline;"> Quantum computing solutions are increasingly deployed in commercial environments through delegated computing, especially one of the most critical issues is to guarantee the confidentiality and proprietary of quantum implementations. 
Since the proposal of general-purpose indistinguishability obfuscation (iO) and functional encryption schemes, iO has emerged as a seemingly versatile cryptography pri… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12297v1-abstract-full').style.display = 'inline'; document.getElementById('2411.12297v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.12297v1-abstract-full" style="display: none;"> Quantum computing solutions are increasingly deployed in commercial environments through delegated computing, especially one of the most critical issues is to guarantee the confidentiality and proprietary of quantum implementations. Since the proposal of general-purpose indistinguishability obfuscation (iO) and functional encryption schemes, iO has emerged as a seemingly versatile cryptography primitive. Existing research on quantum indistinguishable obfuscation (QiO) primarily focuses on task-oriented, lacking solutions to general quantum computing. In this paper, we propose a scheme for constructing QiO via the equivalence of quantum circuits. It introduces the concept of quantum subpath sum equivalence, demonstrating that indistinguishability between two quantum circuits can be achieved by incremental changes in quantum subpaths. The restriction of security loss is solved by reducing the distinguisher to polynomial probability test. The scheme obfuscates the quantum implementation of classical functions in a path-sum specification, ensuring the indistinguishability between different quantum implementations. The results demonstrate the feasibility of indistinguishability obfuscation for general circuits and provide novel insights on intellectual property protection and secure delegated quantum computing. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12297v1-abstract-full').style.display = 'none'; document.getElementById('2411.12297v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.12268">arXiv:2411.12268</a> <span> [<a href="https://arxiv.org/pdf/2411.12268">pdf</a>, <a href="https://arxiv.org/format/2411.12268">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> A Neural Denoising Vocoder for Clean Waveform Generation from Noisy Mel-Spectrogram based on Amplitude and Phase Predictions </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/?searchtype=author&query=Du%2C+H">Hui-Peng Du</a>, <a href="/search/?searchtype=author&query=Lu%2C+Y">Ye-Xin Lu</a>, <a href="/search/?searchtype=author&query=Ai%2C+Y">Yang Ai</a>, <a href="/search/?searchtype=author&query=Ling%2C+Z">Zhen-Hua Ling</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.12268v1-abstract-short" style="display: inline;"> This paper proposes a novel neural denoising vocoder that can generate clean speech waveforms from noisy mel-spectrograms. The proposed neural denoising vocoder consists of two components, i.e., a spectrum predictor and a enhancement module. 
The spectrum predictor first predicts the noisy amplitude and phase spectra from the input noisy mel-spectrogram, and subsequently the enhancement module reco… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12268v1-abstract-full').style.display = 'inline'; document.getElementById('2411.12268v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.12268v1-abstract-full" style="display: none;"> This paper proposes a novel neural denoising vocoder that can generate clean speech waveforms from noisy mel-spectrograms. The proposed neural denoising vocoder consists of two components, i.e., a spectrum predictor and a enhancement module. The spectrum predictor first predicts the noisy amplitude and phase spectra from the input noisy mel-spectrogram, and subsequently the enhancement module recovers the clean amplitude and phase spectrum from noisy ones. Finally, clean speech waveforms are reconstructed through inverse short-time Fourier transform (iSTFT). All operations are performed at the frame-level spectral domain, with the APNet vocoder and MP-SENet speech enhancement model used as the backbones for the two components, respectively. Experimental results demonstrate that our proposed neural denoising vocoder achieves state-of-the-art performance compared to existing neural vocoders on the VoiceBank+DEMAND dataset. Additionally, despite the lack of phase information and partial amplitude information in the input mel-spectrogram, the proposed neural denoising vocoder still achieves comparable performance with the serveral advanced speech enhancement methods. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12268v1-abstract-full').style.display = 'none'; document.getElementById('2411.12268v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by NCMMSC2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.11929">arXiv:2411.11929</a> <span> [<a href="https://arxiv.org/pdf/2411.11929">pdf</a>, <a href="https://arxiv.org/format/2411.11929">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> </div> </div> <p class="title is-5 mathjax"> ChatHTTPFuzz: Large Language Model-Assisted IoT HTTP Fuzzing </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/?searchtype=author&query=Yang%2C+Z">Zhe Yang</a>, <a href="/search/?searchtype=author&query=Peng%2C+H">Hao Peng</a>, <a href="/search/?searchtype=author&query=Jiang%2C+Y">Yanling Jiang</a>, <a href="/search/?searchtype=author&query=Li%2C+X">Xingwei Li</a>, <a href="/search/?searchtype=author&query=Du%2C+H">Haohua Du</a>, <a href="/search/?searchtype=author&query=Wang%2C+S">Shuhai Wang</a>, <a href="/search/?searchtype=author&query=Liu%2C+J">Jianwei Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.11929v1-abstract-short" style="display: inline;"> Internet of Things (IoT) devices offer 
convenience through web interfaces, web VPNs, and other web-based services, all relying on the HTTP protocol. However, these externally exposed HTTP services resent significant security risks. Although fuzzing has shown some effectiveness in identifying vulnerabilities in IoT HTTP services, most state-of-the-art tools still rely on random mutation trategies,… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11929v1-abstract-full').style.display = 'inline'; document.getElementById('2411.11929v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.11929v1-abstract-full" style="display: none;"> Internet of Things (IoT) devices offer convenience through web interfaces, web VPNs, and other web-based services, all relying on the HTTP protocol. However, these externally exposed HTTP services resent significant security risks. Although fuzzing has shown some effectiveness in identifying vulnerabilities in IoT HTTP services, most state-of-the-art tools still rely on random mutation trategies, leading to difficulties in accurately understanding the HTTP protocol's structure and generating many invalid test cases. Furthermore, These fuzzers rely on a limited set of initial seeds for testing. While this approach initiates testing, the limited number and diversity of seeds hinder comprehensive coverage of complex scenarios in IoT HTTP services. In this paper, we investigate and find that large language models (LLMs) excel in parsing HTTP protocol data and analyzing code logic. Based on these findings, we propose a novel LLM-guided IoT HTTP fuzzing method, ChatHTTPFuzz, which automatically parses protocol fields and analyzes service code logic to generate protocol-compliant test cases. Specifically, we use LLMs to label fields in HTTP protocol data, creating seed templates. 
Second, The LLM analyzes service code to guide the generation of additional packets aligned with the code logic, enriching the seed templates and their field values. Finally, we design an enhanced Thompson sampling algorithm based on the exploration balance factor and mutation potential factor to schedule seed templates. We evaluate ChatHTTPFuzz on 14 different real-world IoT devices. It finds more vulnerabilities than SNIPUZZ, BOOFUZZ, and MUTINY. ChatHTTPFuzz has discovered 103 vulnerabilities, of which 68 are unique, and 23 have been assigned CVEs. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11929v1-abstract-full').style.display = 'none'; document.getElementById('2411.11929v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.11703">arXiv:2411.11703</a> <span> [<a href="https://arxiv.org/pdf/2411.11703">pdf</a>, <a href="https://arxiv.org/ps/2411.11703">ps</a>, <a href="https://arxiv.org/format/2411.11703">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Analysis of PDEs">math.AP</span> </div> </div> <p class="title is-5 mathjax"> Construction of multi solitary waves with symmetry for the damped nonlinear Klein-Gordon equation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/?searchtype=author&query=C%C3%B4te%2C+R">Raphaël Côte</a>, <a href="/search/?searchtype=author&query=Du%2C+H">Haiming Du</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.11703v1-abstract-short" style="display: inline;"> We are interested in the nonlinear damped Klein-Gordon equation \[ \partial_t^2 u+2α\partial_t u-Δu+u-|u|^{p-1}u=0 \] on $\mathbb{R}^d$ for $2\le d\le 5$ and energy sub-critical exponents $2 < p < \frac{d+2}{d-2}$. 
We construct multi-solitons, that is, solutions which behave for large times as a sum of decoupled solitons, in various configurations with symmetry: this includes multi-solitons whos… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11703v1-abstract-full').style.display = 'inline'; document.getElementById('2411.11703v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.11703v1-abstract-full" style="display: none;"> We are interested in the nonlinear damped Klein-Gordon equation \[ \partial_t^2 u+2α\partial_t u-Δu+u-|u|^{p-1}u=0 \] on $\mathbb{R}^d$ for $2\le d\le 5$ and energy sub-critical exponents $2 < p < \frac{d+2}{d-2}$. We construct multi-solitons, that is, solutions which behave for large times as a sum of decoupled solitons, in various configurations with symmetry: this includes multi-solitons whose soliton centers lie at the vertices of an expanding regular polygon (with or without a center), of a regular polyhedron (with a center), or of a higher dimensional regular polytope. We give a precise description of these multi-solitons: in particular the interaction between nearest neighbour solitons is asymptotic to $\ln (t)$ as $t \to +\infty$. We also prove that in any multi-soliton, the solitons can not all share the same sign. Both statements generalize and precise results from \cite{F98}, \cite{Nak} and are based on the analysis developed in \cite{CMYZ,CMY}. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11703v1-abstract-full').style.display = 'none'; document.getElementById('2411.11703v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.11340">arXiv:2411.11340</a> <span> [<a href="https://arxiv.org/pdf/2411.11340">pdf</a>, <a href="https://arxiv.org/format/2411.11340">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> A Hybrid Loss Framework for Decomposition-based Time Series Forecasting Methods: Balancing Global and Component Errors </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/?searchtype=author&query=Han%2C+R">Ronghui Han</a>, <a href="/search/?searchtype=author&query=Feng%2C+D">Duanyu Feng</a>, <a href="/search/?searchtype=author&query=Du%2C+H">Hongyu Du</a>, <a href="/search/?searchtype=author&query=Wang%2C+H">Hao Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.11340v1-abstract-short" style="display: inline;"> Accurate time series forecasting, predicting future values based on past data, is crucial for diverse industries. Many current time series methods decompose time series into multiple sub-series, applying different model architectures and training with an end-to-end overall loss for forecasting. 
However, this raises a question: does this overall loss prioritize the importance of critical sub-series… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11340v1-abstract-full').style.display = 'inline'; document.getElementById('2411.11340v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.11340v1-abstract-full" style="display: none;"> Accurate time series forecasting, predicting future values based on past data, is crucial for diverse industries. Many current time series methods decompose time series into multiple sub-series, applying different model architectures and training with an end-to-end overall loss for forecasting. However, this raises a question: does this overall loss prioritize the importance of critical sub-series within the decomposition for the better performance? To investigate this, we conduct a study on the impact of overall loss on existing time series methods with sequence decomposition. Our findings reveal that overall loss may introduce bias in model learning, hindering the learning of the prioritization of more significant sub-series and limiting the forecasting performance. To address this, we propose a hybrid loss framework combining the global and component losses. This framework introduces component losses for each sub-series alongside the original overall loss. It employs a dual min-max algorithm to dynamically adjust weights between the overall loss and component losses, and within component losses. This enables the model to achieve better performance of current time series methods by focusing on more critical sub-series while still maintaining a low overall loss. We integrate our loss framework into several time series methods and evaluate the performance on multiple datasets. Results show an average improvement of 0.5-2% over existing methods without any modifications to the model architectures. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11340v1-abstract-full').style.display = 'none'; document.getElementById('2411.11340v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.11258">arXiv:2411.11258</a> <span> [<a href="https://arxiv.org/pdf/2411.11258">pdf</a>, <a href="https://arxiv.org/format/2411.11258">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> ESTVocoder: An Excitation-Spectral-Transformed Neural Vocoder Conditioned on Mel Spectrogram </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/?searchtype=author&query=Jiang%2C+X">Xiao-Hang Jiang</a>, <a href="/search/?searchtype=author&query=Du%2C+H">Hui-Peng Du</a>, <a href="/search/?searchtype=author&query=Ai%2C+Y">Yang Ai</a>, <a href="/search/?searchtype=author&query=Lu%2C+Y">Ye-Xin Lu</a>, <a href="/search/?searchtype=author&query=Ling%2C+Z">Zhen-Hua Ling</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.11258v1-abstract-short" style="display: inline;"> This paper proposes ESTVocoder, a novel excitation-spectral-transformed neural vocoder within the framework of source-filter theory. 
The ESTVocoder transforms the amplitude and phase spectra of the excitation into the corresponding speech amplitude and phase spectra using a neural filter whose backbone is ConvNeXt v2 blocks. Finally, the speech waveform is reconstructed through the inverse short-t… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11258v1-abstract-full').style.display = 'inline'; document.getElementById('2411.11258v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.11258v1-abstract-full" style="display: none;"> This paper proposes ESTVocoder, a novel excitation-spectral-transformed neural vocoder within the framework of source-filter theory. The ESTVocoder transforms the amplitude and phase spectra of the excitation into the corresponding speech amplitude and phase spectra using a neural filter whose backbone is ConvNeXt v2 blocks. Finally, the speech waveform is reconstructed through the inverse short-time Fourier transform (ISTFT). The excitation is constructed based on the F0: for voiced segments, it contains full harmonic information, while for unvoiced segments, it is represented by noise. The excitation provides the filter with prior knowledge of the amplitude and phase patterns, expecting to reduce the modeling difficulty compared to conventional neural vocoders. To ensure the fidelity of the synthesized speech, an adversarial training strategy is applied to ESTVocoder with multi-scale and multi-resolution discriminators. Analysis-synthesis and text-to-speech experiments both confirm that our proposed ESTVocoder outperforms or is comparable to other baseline neural vocoders, e.g., HiFi-GAN, SiFi-GAN, and Vocos, in terms of synthesized speech quality, with a reasonable model complexity and generation speed. 
Additional analysis experiments also demonstrate that the introduced excitation effectively accelerates the model's convergence process, thanks to the speech spectral prior information contained in the excitation. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11258v1-abstract-full').style.display = 'none'; document.getElementById('2411.11258v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by NCMMSC2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.11232">arXiv:2411.11232</a> <span> [<a href="https://arxiv.org/pdf/2411.11232">pdf</a>, <a href="https://arxiv.org/format/2411.11232">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> SAMOS: A Neural MOS Prediction Model Leveraging Semantic Representations and Acoustic Features </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/?searchtype=author&query=Shi%2C+Y">Yu-Fei Shi</a>, <a href="/search/?searchtype=author&query=Ai%2C+Y">Yang Ai</a>, <a href="/search/?searchtype=author&query=Lu%2C+Y">Ye-Xin Lu</a>, <a href="/search/?searchtype=author&query=Du%2C+H">Hui-Peng Du</a>, <a href="/search/?searchtype=author&query=Ling%2C+Z">Zhen-Hua Ling</a> </p> <p class="abstract mathjax"> <span 
class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.11232v1-abstract-short" style="display: inline;"> Assessing the naturalness of speech using mean opinion score (MOS) prediction models has positive implications for the automatic evaluation of speech synthesis systems. Early MOS prediction models took the raw waveform or amplitude spectrum of speech as input, whereas more advanced methods employed self-supervised-learning (SSL) based models to extract semantic representations from speech for MOS… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11232v1-abstract-full').style.display = 'inline'; document.getElementById('2411.11232v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.11232v1-abstract-full" style="display: none;"> Assessing the naturalness of speech using mean opinion score (MOS) prediction models has positive implications for the automatic evaluation of speech synthesis systems. Early MOS prediction models took the raw waveform or amplitude spectrum of speech as input, whereas more advanced methods employed self-supervised-learning (SSL) based models to extract semantic representations from speech for MOS prediction. These methods utilized limited aspects of speech information for MOS prediction, resulting in restricted prediction accuracy. Therefore, in this paper, we propose SAMOS, a MOS prediction model that leverages both Semantic and Acoustic information of speech to be assessed. Specifically, the proposed SAMOS leverages a pretrained wav2vec2 to extract semantic representations and uses the feature extractor of a pretrained BiVocoder to extract acoustic features. These two types of features are then fed into the prediction network, which includes multi-task heads and an aggregation layer, to obtain the final MOS score. 
Experimental results demonstrate that the proposed SAMOS outperforms current state-of-the-art MOS prediction models on the BVCC dataset and performs comparable performance on the BC2019 dataset, according to the results of system-level evaluation metrics. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11232v1-abstract-full').style.display = 'none'; document.getElementById('2411.11232v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.11123">arXiv:2411.11123</a> <span> [<a href="https://arxiv.org/pdf/2411.11123">pdf</a>, <a href="https://arxiv.org/format/2411.11123">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Pitch-and-Spectrum-Aware Singing Quality Assessment with Bias Correction and Model Fusion </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/?searchtype=author&query=Shi%2C+Y">Yu-Fei Shi</a>, <a href="/search/?searchtype=author&query=Ai%2C+Y">Yang Ai</a>, <a href="/search/?searchtype=author&query=Lu%2C+Y">Ye-Xin Lu</a>, <a href="/search/?searchtype=author&query=Du%2C+H">Hui-Peng Du</a>, <a href="/search/?searchtype=author&query=Ling%2C+Z">Zhen-Hua Ling</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.11123v1-abstract-short" 
style="display: inline;"> We participated in track 2 of the VoiceMOS Challenge 2024, which aimed to predict the mean opinion score (MOS) of singing samples. Our submission secured the first place among all participating teams, excluding the official baseline. In this paper, we further improve our submission and propose a novel Pitch-and-Spectrum-aware Singing Quality Assessment (PS-SQA) method. The PS-SQA is designed based… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11123v1-abstract-full').style.display = 'inline'; document.getElementById('2411.11123v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.11123v1-abstract-full" style="display: none;"> We participated in track 2 of the VoiceMOS Challenge 2024, which aimed to predict the mean opinion score (MOS) of singing samples. Our submission secured the first place among all participating teams, excluding the official baseline. In this paper, we further improve our submission and propose a novel Pitch-and-Spectrum-aware Singing Quality Assessment (PS-SQA) method. The PS-SQA is designed based on the self-supervised-learning (SSL) MOS predictor, incorporating singing pitch and spectral information, which are extracted using pitch histogram and non-quantized neural codec, respectively. Additionally, the PS-SQA introduces a bias correction strategy to address prediction biases caused by low-resource training samples, and employs model fusion technology to further enhance prediction accuracy. Experimental results confirm that our proposed PS-SQA significantly outperforms all competing systems across all system-level metrics, confirming its strong singing quality assessment capabilities. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11123v1-abstract-full').style.display = 'none'; document.getElementById('2411.11123v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.09846">arXiv:2411.09846</a> <span> [<a href="https://arxiv.org/pdf/2411.09846">pdf</a>, <a href="https://arxiv.org/format/2411.09846">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Software Engineering">cs.SE</span> </div> </div> <p class="title is-5 mathjax"> Leveraging Propagated Infection to Crossfire Mutants </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/?searchtype=author&query=Du%2C+H">Hang Du</a>, <a href="/search/?searchtype=author&query=Palepu%2C+V+K">Vijay Krishna Palepu</a>, <a href="/search/?searchtype=author&query=Jones%2C+J+A">James A. Jones</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.09846v1-abstract-short" style="display: inline;"> Mutation testing was proposed to identify weaknesses in test suites by repeatedly generating artificially faulty versions of the software (mutants) and determining if the test suite is sufficient to detect them (kill them). When the tests are insufficient, each surviving mutant provides an opportunity to improve the test suite. 
We conducted a study and found that many such surviving mutants (up to… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.09846v1-abstract-full').style.display = 'inline'; document.getElementById('2411.09846v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.09846v1-abstract-full" style="display: none;"> Mutation testing was proposed to identify weaknesses in test suites by repeatedly generating artificially faulty versions of the software (mutants) and determining if the test suite is sufficient to detect them (kill them). When the tests are insufficient, each surviving mutant provides an opportunity to improve the test suite. We conducted a study and found that many such surviving mutants (up to 84% for the subjects of our study) are detectable by simply augmenting existing tests with additional assertions, or assertion amplification. Moreover, we find that many of these mutants are detectable by multiple existing tests, giving developers options for how to detect them. To help with these challenges, we created a technique that performs memory-state analysis to identify candidate assertions that developers can use to detect the surviving mutants. Additionally, we build upon prior research that identifies ``crossfiring'' opportunities -- tests that coincidentally kill multiple mutants. To this end, we developed a theoretical model that describes the varying granularities that crossfiring can occur in the existing test suite, which provide opportunities and options for how to kill surviving mutants. We operationalize this model to an accompanying technique that optimizes the assertion amplification of the existing tests to crossfire multiple mutants with fewer added assertions, optionally concentrated within fewer tests. 
Our experiments show that we can kill all surviving mutants that are detectable with existing test data with only 1.1% of the identified assertion candidates, and increasing by a factor of 6x, on average, the number of killed mutants from amplified tests, over tests that do not crossfire. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.09846v1-abstract-full').style.display = 'none'; document.getElementById('2411.09846v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted at ICSE '25</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> D.2.5 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.09712">arXiv:2411.09712</a> <span> [<a href="https://arxiv.org/pdf/2411.09712">pdf</a>, <a href="https://arxiv.org/format/2411.09712">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Science and Game Theory">cs.GT</span> </div> </div> <p class="title is-5 mathjax"> Space-Air-Ground Integrated MEC-Assisted Industrial Cyber-Physical Systems: An Online Decentralized Optimization Approach </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/?searchtype=author&query=He%2C+L">Long He</a>, <a href="/search/?searchtype=author&query=Sun%2C+G">Geng Sun</a>, <a 
href="/search/?searchtype=author&query=Sun%2C+Z">Zemin Sun</a>, <a href="/search/?searchtype=author&query=Wang%2C+J">Jiacheng Wang</a>, <a href="/search/?searchtype=author&query=Du%2C+H">Hongyang Du</a>, <a href="/search/?searchtype=author&query=Niyato%2C+D">Dusit Niyato</a>, <a href="/search/?searchtype=author&query=Liu%2C+J">Jiangchuan Liu</a>, <a href="/search/?searchtype=author&query=Leung%2C+V+C+M">Victor C. M. Leung</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.09712v1-abstract-short" style="display: inline;"> Cloud computing and edge/fog computing are playing a pivotal role in driving the transformation of industrial cyber-physical systems (ICPS) towards greater intelligence and automation by providing high-quality computation offloading services to Internet of Things devices (IoTDs). Recently, space-air-ground integrated multi-access edge computing (SAGIMEC) is emerging as a promising architecture com… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.09712v1-abstract-full').style.display = 'inline'; document.getElementById('2411.09712v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.09712v1-abstract-full" style="display: none;"> Cloud computing and edge/fog computing are playing a pivotal role in driving the transformation of industrial cyber-physical systems (ICPS) towards greater intelligence and automation by providing high-quality computation offloading services to Internet of Things devices (IoTDs). Recently, space-air-ground integrated multi-access edge computing (SAGIMEC) is emerging as a promising architecture combining edge computing and cloud computing, which has the potential to be integrated with ICPS to accelerate the realization of the above vision. 
In this work, we first present an SAGIMEC-assisted ICPS architecture that incorporates edge computing and cloud computing through seamless connectivity supported by satellite networks to achieve determinism in connectivity, networked computing, and intelligent networked control. Then, we formulate a joint satellite selection, computation offloading, communication resource allocation, computation resource allocation, and UAV trajectory control optimization problem (JSC4OP) to maximize the quality of service (QoS) of IoTDs. This problem considers both the dynamics and uncertainties of the system environment, as well as the limited resources and energy of UAVs. Given the complexity of JSC4OP, we propose an online decentralized optimization approach (ODOA) to solve the problem. Specifically, JSC4OP is first transformed into a real-time decision-making optimization problem (RDOP) by leveraging Lyapunov optimization. Then, to solve the RDOP, we introduce an online learning-based latency prediction method to predict the uncertain system environment and a game theoretic decision-making method to make real-time decisions. Finally, theoretical analysis confirms the effectiveness of the ODOA, while the simulation results demonstrate that the proposed ODOA outperforms other alternative approaches in terms of overall system performance. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.09712v1-abstract-full').style.display = 'none'; document.getElementById('2411.09712v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">arXiv admin note: text overlap with arXiv:2406.11918</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.08672">arXiv:2411.08672</a> <span> [<a href="https://arxiv.org/pdf/2411.08672">pdf</a>, <a href="https://arxiv.org/format/2411.08672">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Networking and Internet Architecture">cs.NI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Joint Model Caching and Resource Allocation in Generative AI-Enabled Wireless Edge Networks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/?searchtype=author&query=Liu%2C+Z">Zhang Liu</a>, <a href="/search/?searchtype=author&query=Du%2C+H">Hongyang Du</a>, <a href="/search/?searchtype=author&query=Huang%2C+L">Lianfen Huang</a>, <a href="/search/?searchtype=author&query=Gao%2C+Z">Zhibin Gao</a>, <a href="/search/?searchtype=author&query=Niyato%2C+D">Dusit Niyato</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.08672v1-abstract-short" style="display: inline;"> With the rapid advancement of artificial intelligence (AI), generative AI (GenAI) has emerged as a transformative tool, enabling customized and personalized AI-generated content (AIGC) services. 
However, GenAI models with billions of parameters require substantial memory capacity and computational power for deployment and execution, presenting significant challenges to resource-limited edge networ… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.08672v1-abstract-full').style.display = 'inline'; document.getElementById('2411.08672v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.08672v1-abstract-full" style="display: none;"> With the rapid advancement of artificial intelligence (AI), generative AI (GenAI) has emerged as a transformative tool, enabling customized and personalized AI-generated content (AIGC) services. However, GenAI models with billions of parameters require substantial memory capacity and computational power for deployment and execution, presenting significant challenges to resource-limited edge networks. In this paper, we address the joint model caching and resource allocation problem in GenAI-enabled wireless edge networks. Our objective is to balance the trade-off between delivering high-quality AIGC and minimizing the delay in AIGC service provisioning. To tackle this problem, we employ a deep deterministic policy gradient (DDPG)-based reinforcement learning approach, capable of efficiently determining optimal model caching and resource allocation decisions for AIGC services in response to user mobility and time-varying channel conditions. Numerical results demonstrate that DDPG achieves a higher model hit ratio and provides superior-quality, lower-latency AIGC services compared to other benchmark solutions. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.08672v1-abstract-full').style.display = 'none'; document.getElementById('2411.08672v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">conference paper with 6 pages and 5 figures. arXiv admin note: text overlap with arXiv:2411.01458</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.06928">arXiv:2411.06928</a> <span> [<a href="https://arxiv.org/pdf/2411.06928">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Electroencephalogram-based Multi-class Decoding of Attended Speakers' Direction with Audio Spatial Spectrum </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/?searchtype=author&query=Zhang%2C+Y">Yuanming Zhang</a>, <a href="/search/?searchtype=author&query=Lu%2C+J">Jing Lu</a>, <a href="/search/?searchtype=author&query=Lin%2C+Z">Zhibin Lin</a>, <a href="/search/?searchtype=author&query=Chen%2C+F">Fei Chen</a>, <a href="/search/?searchtype=author&query=Du%2C+H">Haoliang Du</a>, <a 
href="/search/?searchtype=author&query=Gao%2C+X">Xia Gao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.06928v1-abstract-short" style="display: inline;"> Decoding the directional focus of an attended speaker from listeners' electroencephalogram (EEG) signals is essential for developing brain-computer interfaces to improve the quality of life for individuals with hearing impairment. Previous works have concentrated on binary directional focus decoding, i.e., determining whether the attended speaker is on the left or right side of the listener. Howev… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.06928v1-abstract-full').style.display = 'inline'; document.getElementById('2411.06928v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.06928v1-abstract-full" style="display: none;"> Decoding the directional focus of an attended speaker from listeners' electroencephalogram (EEG) signals is essential for developing brain-computer interfaces to improve the quality of life for individuals with hearing impairment. Previous works have concentrated on binary directional focus decoding, i.e., determining whether the attended speaker is on the left or right side of the listener. However, a more precise decoding of the exact direction of the attended speaker is necessary for effective speech processing. Additionally, audio spatial information has not been effectively leveraged, resulting in suboptimal decoding results. In this paper, we observe that, on our recently presented dataset with 15-class directional focus, models relying exclusively on EEG inputs exhibit significantly lower accuracy when decoding the directional focus in both leave-one-subject-out and leave-one-trial-out scenarios. 
By integrating audio spatial spectra with EEG features, the decoding accuracy can be effectively improved. We employ the CNN, LSM-CNN, and EEG-Deformer models to decode the directional focus from listeners' EEG signals with the auxiliary audio spatial spectra. The proposed Sp-Aux-Deformer model achieves notable 15-class decoding accuracies of 57.48% and 61.83% in leave-one-subject-out and leave-one-trial-out scenarios, respectively. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.06928v1-abstract-full').style.display = 'none'; document.getElementById('2411.06928v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.04137">arXiv:2411.04137</a> <span> [<a href="https://arxiv.org/pdf/2411.04137">pdf</a>, <a href="https://arxiv.org/format/2411.04137">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Networking and Internet Architecture">cs.NI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Generative AI Enabled Matching for 6G Multiple Access </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/?searchtype=author&query=Wang%2C+X">Xudong Wang</a>, <a href="/search/?searchtype=author&query=Du%2C+H">Hongyang Du</a>, <a href="/search/?searchtype=author&query=Niyato%2C+D">Dusit Niyato</a>, <a href="/search/?searchtype=author&query=Zhou%2C+L">Lijie 
Zhou</a>, <a href="/search/?searchtype=author&query=Feng%2C+L">Lei Feng</a>, <a href="/search/?searchtype=author&query=Yang%2C+Z">Zhixiang Yang</a>, <a href="/search/?searchtype=author&query=Zhou%2C+F">Fanqin Zhou</a>, <a href="/search/?searchtype=author&query=Li%2C+W">Wenjing Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.04137v1-abstract-short" style="display: inline;"> In wireless networks, applying deep learning models to solve matching problems between different entities has become a mainstream and effective approach. However, the complex network topology in 6G multiple access presents significant challenges for the real-time performance and stability of matching generation. Generative artificial intelligence (GenAI) has demonstrated strong capabilities in gra… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.04137v1-abstract-full').style.display = 'inline'; document.getElementById('2411.04137v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.04137v1-abstract-full" style="display: none;"> In wireless networks, applying deep learning models to solve matching problems between different entities has become a mainstream and effective approach. However, the complex network topology in 6G multiple access presents significant challenges for the real-time performance and stability of matching generation. Generative artificial intelligence (GenAI) has demonstrated strong capabilities in graph feature extraction, exploration, and generation, offering potential for graph-structured matching generation. In this paper, we propose a GenAI-enabled matching generation framework to support 6G multiple access. 
Specifically, we first summarize the classical matching theory, discuss common GenAI models and applications from the perspective of matching generation. Then, we propose a framework based on generative diffusion models (GDMs) that iteratively denoises toward reward maximization to generate a matching strategy that meets specific requirements. Experimental results show that, compared to decision-based AI approaches, our framework can generate more effective matching strategies based on given conditions and predefined rewards, helping to solve complex problems in 6G multiple access, such as task allocation. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.04137v1-abstract-full').style.display = 'none'; document.getElementById('2411.04137v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">8 pages, 5 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.02863">arXiv:2411.02863</a> <span> [<a href="https://arxiv.org/pdf/2411.02863">pdf</a>, <a href="https://arxiv.org/format/2411.02863">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Programming Languages">cs.PL</span> </div> </div> <p class="title is-5 mathjax"> LoopSCC: Towards Summarizing Multi-branch Loops within Determinate Cycles </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/?searchtype=author&query=Zhu%2C+K">Kai Zhu</a>, <a href="/search/?searchtype=author&query=Guo%2C+C">Chenkai Guo</a>, <a href="/search/?searchtype=author&query=Yan%2C+K">Kuihao Yan</a>, <a href="/search/?searchtype=author&query=Jia%2C+X">Xiaoqi Jia</a>, <a href="/search/?searchtype=author&query=Du%2C+H">Haichao Du</a>, <a href="/search/?searchtype=author&query=Huang%2C+Q">Qingjia Huang</a>, <a href="/search/?searchtype=author&query=Xie%2C+Y">Yamin Xie</a>, <a href="/search/?searchtype=author&query=Tang%2C+J">Jing Tang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.02863v1-abstract-short" style="display: inline;"> Analyzing programs with loops is a challenging task, suffering from potential issues such as indeterminate number of iterations and exponential growth of control flow complexity. Loop summarization, as a static analysis method for concrete semantic interpretation, receives increasing focuses. It produces symbolic expressions semantically equivalent to the loop program. 
However, current loop summar… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02863v1-abstract-full').style.display = 'inline'; document.getElementById('2411.02863v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.02863v1-abstract-full" style="display: none;"> Analyzing programs with loops is a challenging task, suffering from potential issues such as indeterminate number of iterations and exponential growth of control flow complexity. Loop summarization, as a static analysis method for concrete semantic interpretation, receives increasing focuses. It produces symbolic expressions semantically equivalent to the loop program. However, current loop summarization methods are only suitable for single-branch loops or multi-branch loops with simple cycles, without supporting complex loops with irregular branch-to-branch transitions. In this paper, we proposed LoopSCC, a novel loop summarization technique, to achieve concrete semantic interpretation on complex loop. LoopSCC analyzes the control flow at the granularity of single-loop-path and applies the strongly connected components (SCC for short) for contraction and simplification, resulting in the contracted single-loop-path graph (CSG for short). Based on the control flow information provided by the CSG, we can convert the loop summary into a combination of SCC summaries. When an SCC contains irregular branch-to-branch transitions, we propose to explore a convergent range to identify the determinate cycles of different execution paths, referred as oscillatory interval. The loop summarization composed of both iteration conditions and execution operations can eventually be derived recursively. Extensive experiments compared to six state-of-the-art loop interpretation methods are conducted to evaluate the effectiveness of LoopSCC. 
From the results, LoopSCC outperforms comparative methods in both interpretation accuracy and application effectiveness. Especially, LoopSCC achieves a 100% interpretation accuracy on public common-used benchmark. A systematical study for loop properties on three large-scale programs illustrates that LoopSCC presents outstanding scalability for real-world loop programs. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02863v1-abstract-full').style.display = 'none'; document.getElementById('2411.02863v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.01535">arXiv:2411.01535</a> <span> [<a href="https://arxiv.org/pdf/2411.01535">pdf</a>, <a href="https://arxiv.org/format/2411.01535">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Biomolecules">q-bio.BM</span> </div> </div> <p class="title is-5 mathjax"> Customized Subgraph Selection and Encoding for Drug-drug Interaction Prediction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/?searchtype=author&query=Du%2C+H">Haotong Du</a>, <a href="/search/?searchtype=author&query=Yao%2C+Q">Quanming Yao</a>, <a href="/search/?searchtype=author&query=Zhang%2C+J">Juzheng Zhang</a>, <a href="/search/?searchtype=author&query=Liu%2C+Y">Yang Liu</a>, <a 
href="/search/?searchtype=author&query=Wang%2C+Z">Zhen Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.01535v1-abstract-short" style="display: inline;"> Subgraph-based methods have proven to be effective and interpretable in predicting drug-drug interactions (DDIs), which are essential for medical practice and drug development. Subgraph selection and encoding are critical stages in these methods, yet customizing these components remains underexplored due to the high cost of manual adjustments. In this study, inspired by the success of neural archi… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.01535v1-abstract-full').style.display = 'inline'; document.getElementById('2411.01535v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.01535v1-abstract-full" style="display: none;"> Subgraph-based methods have proven to be effective and interpretable in predicting drug-drug interactions (DDIs), which are essential for medical practice and drug development. Subgraph selection and encoding are critical stages in these methods, yet customizing these components remains underexplored due to the high cost of manual adjustments. In this study, inspired by the success of neural architecture search (NAS), we propose a method to search for data-specific components within subgraph-based frameworks. Specifically, we introduce extensive subgraph selection and encoding spaces that account for the diverse contexts of drug interactions in DDI prediction. To address the challenge of large search spaces and high sampling costs, we design a relaxation mechanism that uses an approximation strategy to efficiently explore optimal subgraph configurations. This approach allows for robust exploration of the search space. 
Extensive experiments demonstrate the effectiveness and superiority of the proposed method, with the discovered subgraphs and encoding functions highlighting the model's adaptability. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.01535v1-abstract-full').style.display = 'none'; document.getElementById('2411.01535v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by NeurIPS 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.01458">arXiv:2411.01458</a> <span> [<a href="https://arxiv.org/pdf/2411.01458">pdf</a>, <a href="https://arxiv.org/format/2411.01458">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> </div> </div> <p class="title is-5 mathjax"> Two-Timescale Model Caching and Resource Allocation for Edge-Enabled AI-Generated Content Services </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/?searchtype=author&query=Liu%2C+Z">Zhang Liu</a>, <a href="/search/?searchtype=author&query=Du%2C+H">Hongyang Du</a>, <a href="/search/?searchtype=author&query=Hou%2C+X">Xiangwang Hou</a>, <a href="/search/?searchtype=author&query=Huang%2C+L">Lianfen Huang</a>, <a 
href="/search/?searchtype=author&query=Hosseinalipour%2C+S">Seyyedali Hosseinalipour</a>, <a href="/search/?searchtype=author&query=Niyato%2C+D">Dusit Niyato</a>, <a href="/search/?searchtype=author&query=Letaief%2C+K+B">Khaled Ben Letaief</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.01458v1-abstract-short" style="display: inline;"> Generative AI (GenAI) has emerged as a transformative technology, enabling customized and personalized AI-generated content (AIGC) services. In this paper, we address challenges of edge-enabled AIGC service provisioning, which remain underexplored in the literature. These services require executing GenAI models with billions of parameters, posing significant obstacles to resource-limited wireless… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.01458v1-abstract-full').style.display = 'inline'; document.getElementById('2411.01458v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.01458v1-abstract-full" style="display: none;"> Generative AI (GenAI) has emerged as a transformative technology, enabling customized and personalized AI-generated content (AIGC) services. In this paper, we address challenges of edge-enabled AIGC service provisioning, which remain underexplored in the literature. These services require executing GenAI models with billions of parameters, posing significant obstacles to resource-limited wireless edge. We subsequently introduce the formulation of joint model caching and resource allocation for AIGC services to balance a trade-off between AIGC quality and latency metrics. We obtain mathematical relationships of these metrics with the computational resources required by GenAI models via experimentation. 
Afterward, we decompose the formulation into a model caching subproblem on a long-timescale and a resource allocation subproblem on a short-timescale. Since the variables to be solved are discrete and continuous, respectively, we leverage a double deep Q-network (DDQN) algorithm to solve the former subproblem and propose a diffusion-based deep deterministic policy gradient (D3PG) algorithm to solve the latter. The proposed D3PG algorithm makes an innovative use of diffusion models as the actor network to determine optimal resource allocation decisions. Consequently, we integrate these two learning methods within the overarching two-timescale deep reinforcement learning (T2DRL) algorithm, the performance of which is studied through comparative numerical simulations. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.01458v1-abstract-full').style.display = 'none'; document.getElementById('2411.01458v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">14 pages, 8 figures, 39 references</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.01159">arXiv:2411.01159</a> <span> [<a href="https://arxiv.org/pdf/2411.01159">pdf</a>, <a href="https://arxiv.org/format/2411.01159">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Supervised Score-Based Modeling by Gradient Boosting </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/?searchtype=author&query=Zhao%2C+C">Changyuan Zhao</a>, <a href="/search/?searchtype=author&query=Du%2C+H">Hongyang Du</a>, <a href="/search/?searchtype=author&query=Liu%2C+G">Guangyuan Liu</a>, <a href="/search/?searchtype=author&query=Niyato%2C+D">Dusit Niyato</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.01159v1-abstract-short" style="display: inline;"> Score-based generative models can effectively learn the distribution of data by estimating the gradient of the distribution. Due to the multi-step denoising characteristic, researchers have recently considered combining score-based generative models with the gradient boosting algorithm, a multi-step supervised learning algorithm, to solve supervised learning tasks. 
However, existing generative mod… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.01159v1-abstract-full').style.display = 'inline'; document.getElementById('2411.01159v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.01159v1-abstract-full" style="display: none;"> Score-based generative models can effectively learn the distribution of data by estimating the gradient of the distribution. Due to the multi-step denoising characteristic, researchers have recently considered combining score-based generative models with the gradient boosting algorithm, a multi-step supervised learning algorithm, to solve supervised learning tasks. However, existing generative model algorithms are often limited by the stochastic nature of the models and the long inference time, impacting prediction performances. Therefore, we propose a Supervised Score-based Model (SSM), which can be viewed as a gradient boosting algorithm combining score matching. We provide a theoretical analysis of learning and sampling for SSM to balance inference time and prediction accuracy. Via the ablation experiment in selected examples, we demonstrate the outstanding performances of the proposed techniques. Additionally, we compare our model with other probabilistic models, including Natural Gradient Boosting (NGboost), Classification and Regression Diffusion Models (CARD), Diffusion Boosted Trees (DBT), and Bayesian neural network-based models. The experimental results show that our model outperforms existing models in both accuracy and inference time. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.01159v1-abstract-full').style.display = 'none'; document.getElementById('2411.01159v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">9 pages, 1 figure, 4 tables</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.00464">arXiv:2411.00464</a> <span> [<a href="https://arxiv.org/pdf/2411.00464">pdf</a>, <a href="https://arxiv.org/format/2411.00464">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> MDCTCodec: A Lightweight MDCT-based Neural Audio Codec towards High Sampling Rate and Low Bitrate Scenarios </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/?searchtype=author&query=Jiang%2C+X">Xiao-Hang Jiang</a>, <a href="/search/?searchtype=author&query=Ai%2C+Y">Yang Ai</a>, <a href="/search/?searchtype=author&query=Zheng%2C+R">Rui-Chen Zheng</a>, <a href="/search/?searchtype=author&query=Du%2C+H">Hui-Peng Du</a>, <a href="/search/?searchtype=author&query=Lu%2C+Y">Ye-Xin Lu</a>, <a href="/search/?searchtype=author&query=Ling%2C+Z">Zhen-Hua Ling</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" 
id="2411.00464v1-abstract-short" style="display: inline;"> In this paper, we propose MDCTCodec, an efficient lightweight end-to-end neural audio codec based on the modified discrete cosine transform (MDCT). The encoder takes the MDCT spectrum of audio as input, encoding it into a continuous latent code which is then discretized by a residual vector quantizer (RVQ). Subsequently, the decoder decodes the MDCT spectrum from the quantized latent code and reco… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00464v1-abstract-full').style.display = 'inline'; document.getElementById('2411.00464v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.00464v1-abstract-full" style="display: none;"> In this paper, we propose MDCTCodec, an efficient lightweight end-to-end neural audio codec based on the modified discrete cosine transform (MDCT). The encoder takes the MDCT spectrum of audio as input, encoding it into a continuous latent code which is then discretized by a residual vector quantizer (RVQ). Subsequently, the decoder decodes the MDCT spectrum from the quantized latent code and reconstructs audio via inverse MDCT. During the training phase, a novel multi-resolution MDCT-based discriminator (MR-MDCTD) is adopted to discriminate the natural or decoded MDCT spectrum for adversarial training. Experimental results confirm that, in scenarios with high sampling rates and low bitrates, the MDCTCodec exhibited high decoded audio quality, improved training and generation efficiency, and compact model size compared to baseline codecs. Specifically, the MDCTCodec achieved a ViSQOL score of 4.18 at a sampling rate of 48 kHz and a bitrate of 6 kbps on the public VCTK corpus. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00464v1-abstract-full').style.display = 'none'; document.getElementById('2411.00464v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by 2024 IEEE Spoken Language Technology Workshop (SLT2024)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.23563">arXiv:2410.23563</a> <span> [<a href="https://arxiv.org/pdf/2410.23563">pdf</a>, <a href="https://arxiv.org/format/2410.23563">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> </div> </div> <p class="title is-5 mathjax"> Across-Platform Detection of Malicious Cryptocurrency Transactions via Account Interaction Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/?searchtype=author&query=Che%2C+Z">Zheng Che</a>, <a href="/search/?searchtype=author&query=Shen%2C+M">Meng Shen</a>, <a href="/search/?searchtype=author&query=Tan%2C+Z">Zhehui Tan</a>, <a href="/search/?searchtype=author&query=Du%2C+H">Hanbiao Du</a>, <a href="/search/?searchtype=author&query=Zhu%2C+L">Liehuang Zhu</a>, <a href="/search/?searchtype=author&query=Wang%2C+W">Wei Wang</a>, <a href="/search/?searchtype=author&query=Chen%2C+T">Ting Chen</a>, <a href="/search/?searchtype=author&query=Zhao%2C+Q">Qinglin Zhao</a>, <a href="/search/?searchtype=author&query=Xie%2C+Y">Yong Xie</a> </p> <p class="abstract mathjax"> <span 
class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.23563v1-abstract-short" style="display: inline;"> With the rapid evolution of Web3.0, cryptocurrency has become a cornerstone of decentralized finance. While these digital assets enable efficient and borderless financial transactions, their pseudonymous nature has also attracted malicious activities such as money laundering, fraud, and other financial crimes. Effective detection of malicious transactions is crucial to maintaining the security and… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.23563v1-abstract-full').style.display = 'inline'; document.getElementById('2410.23563v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.23563v1-abstract-full" style="display: none;"> With the rapid evolution of Web3.0, cryptocurrency has become a cornerstone of decentralized finance. While these digital assets enable efficient and borderless financial transactions, their pseudonymous nature has also attracted malicious activities such as money laundering, fraud, and other financial crimes. Effective detection of malicious transactions is crucial to maintaining the security and integrity of the Web 3.0 ecosystem. Existing malicious transaction detection methods rely on large amounts of labeled data and suffer from low generalization. Label-efficient and generalizable malicious transaction detection remains a challenging task. In this paper, we propose ShadowEyes, a novel malicious transaction detection method. Specifically, we first propose a generalized graph structure named TxGraph as a representation of malicious transaction, which captures the interaction features of each malicious account and its neighbors. 
Then we carefully design a data augmentation method tailored to simulate the evolution of malicious transactions to generate positive pairs. To alleviate account label scarcity, we further design a graph contrastive mechanism, which enables ShadowEyes to learn discriminative features effectively from unlabeled data, thereby enhancing its detection capabilities in real-world scenarios. We conduct extensive experiments using public datasets to evaluate the performance of ShadowEyes. The results demonstrate that it outperforms state-of-the-art (SOTA) methods in four typical scenarios. Specifically, in the zero-shot learning scenario, it can achieve an F1 score of 76.98% for identifying gambling transactions, surpassing the SOTA method by 12.05%. In the scenario of across-platform malicious transaction detection, ShadowEyes maintains an F1 score of around 90%, which is 10% higher than the SOTA method. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.23563v1-abstract-full').style.display = 'none'; document.getElementById('2410.23563v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.22807">arXiv:2410.22807</a> <span> [<a href="https://arxiv.org/pdf/2410.22807">pdf</a>, <a href="https://arxiv.org/format/2410.22807">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> APCodec+: A Spectrum-Coding-Based High-Fidelity and High-Compression-Rate Neural Audio Codec with Staged Training Paradigm </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/?searchtype=author&query=Du%2C+H">Hui-Peng Du</a>, <a href="/search/?searchtype=author&query=Ai%2C+Y">Yang Ai</a>, <a href="/search/?searchtype=author&query=Zheng%2C+R">Rui-Chen Zheng</a>, <a href="/search/?searchtype=author&query=Ling%2C+Z">Zhen-Hua Ling</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.22807v1-abstract-short" style="display: inline;"> This paper proposes a novel neural audio codec, named APCodec+, which is an improved version of APCodec. The APCodec+ takes the audio amplitude and phase spectra as the coding object, and employs an adversarial training strategy. Innovatively, we propose a two-stage joint-individual training paradigm for APCodec+. 
In the joint training stage, the encoder, quantizer, decoder and discriminator are j… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.22807v1-abstract-full').style.display = 'inline'; document.getElementById('2410.22807v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.22807v1-abstract-full" style="display: none;"> This paper proposes a novel neural audio codec, named APCodec+, which is an improved version of APCodec. The APCodec+ takes the audio amplitude and phase spectra as the coding object, and employs an adversarial training strategy. Innovatively, we propose a two-stage joint-individual training paradigm for APCodec+. In the joint training stage, the encoder, quantizer, decoder and discriminator are jointly trained with complete spectral loss, quantization loss, and adversarial loss. In the individual training stage, the encoder and quantizer fix their parameters and provide high-quality training data for the decoder and discriminator. The decoder and discriminator are individually trained from scratch without the quantization loss. The purpose of introducing individual training is to reduce the learning difficulty of the decoder, thereby further improving the fidelity of the decoded audio. Experimental results confirm that our proposed APCodec+ at low bitrates achieves comparable performance with baseline codecs at higher bitrates, thanks to the proposed staged training paradigm. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.22807v1-abstract-full').style.display = 'none'; document.getElementById('2410.22807v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by ISCSLP 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.22112">arXiv:2410.22112</a> <span> [<a href="https://arxiv.org/pdf/2410.22112">pdf</a>, <a href="https://arxiv.org/format/2410.22112">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> </div> </div> <p class="title is-5 mathjax"> Multimodal Semantic Communication for Generative Audio-Driven Video Conferencing </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/?searchtype=author&query=Tong%2C+H">Haonan Tong</a>, <a href="/search/?searchtype=author&query=Li%2C+H">Haopeng Li</a>, <a href="/search/?searchtype=author&query=Du%2C+H">Hongyang Du</a>, <a href="/search/?searchtype=author&query=Yang%2C+Z">Zhaohui Yang</a>, <a href="/search/?searchtype=author&query=Yin%2C+C">Changchuan Yin</a>, <a href="/search/?searchtype=author&query=Niyato%2C+D">Dusit Niyato</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.22112v1-abstract-short" style="display: inline;"> This paper studies an efficient multimodal data communication scheme for video conferencing. In our considered system, a speaker gives a talk to the audiences, with talking head video and audio being transmitted. 
Since the speaker does not frequently change posture and high-fidelity transmission of audio (speech and music) is required, redundant visual video data exists and can be removed by gener… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.22112v1-abstract-full').style.display = 'inline'; document.getElementById('2410.22112v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.22112v1-abstract-full" style="display: none;"> This paper studies an efficient multimodal data communication scheme for video conferencing. In our considered system, a speaker gives a talk to the audiences, with talking head video and audio being transmitted. Since the speaker does not frequently change posture and high-fidelity transmission of audio (speech and music) is required, redundant visual video data exists and can be removed by generating the video from the audio. To this end, we propose a wave-to-video (Wav2Vid) system, an efficient video transmission framework that reduces transmitted data by generating talking head video from audio. In particular, full-duration audio and short-duration video data are synchronously transmitted through a wireless channel, with neural networks (NNs) extracting and encoding audio and video semantics. The receiver then combines the decoded audio and video data, as well as uses a generative adversarial network (GAN) based model to generate the lip movement videos of the speaker. Simulation results show that the proposed Wav2Vid system can reduce the amount of transmitted data by up to 83% while maintaining the perceptual quality of the generated conferencing video. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.22112v1-abstract-full').style.display = 'none'; document.getElementById('2410.22112v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">accepted by IEEE Wireless Communications Letters</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.19586">arXiv:2410.19586</a> <span> [<a href="https://arxiv.org/pdf/2410.19586">pdf</a>, <a href="https://arxiv.org/format/2410.19586">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Diverse Sign Language Translation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/?searchtype=author&query=Shen%2C+X">Xin Shen</a>, <a href="/search/?searchtype=author&query=Shen%2C+L">Lei Shen</a>, <a href="/search/?searchtype=author&query=Yuan%2C+S">Shaozu Yuan</a>, <a href="/search/?searchtype=author&query=Du%2C+H">Heming Du</a>, <a href="/search/?searchtype=author&query=Sun%2C+H">Haiyang Sun</a>, <a href="/search/?searchtype=author&query=Yu%2C+X">Xin Yu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.19586v1-abstract-short" style="display: inline;"> Like 
spoken languages, a single sign language expression could correspond to multiple valid textual interpretations. Hence, learning a rigid one-to-one mapping for sign language translation (SLT) models might be inadequate, particularly in the case of limited data. In this work, we introduce a Diverse Sign Language Translation (DivSLT) task, aiming to generate diverse yet accurate translations for… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.19586v1-abstract-full').style.display = 'inline'; document.getElementById('2410.19586v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.19586v1-abstract-full" style="display: none;"> Like spoken languages, a single sign language expression could correspond to multiple valid textual interpretations. Hence, learning a rigid one-to-one mapping for sign language translation (SLT) models might be inadequate, particularly in the case of limited data. In this work, we introduce a Diverse Sign Language Translation (DivSLT) task, aiming to generate diverse yet accurate translations for sign language videos. Firstly, we employ large language models (LLM) to generate multiple references for the widely-used CSL-Daily and PHOENIX14T SLT datasets. Here, native speakers are only invited to touch up inaccurate references, thus significantly improving the annotation efficiency. Secondly, we provide a benchmark model to spur research in this task. Specifically, we investigate multi-reference training strategies to enable our DivSLT model to achieve diverse translations. Then, to enhance translation accuracy, we employ the max-reward-driven reinforcement learning objective that maximizes the reward of the translated result. Additionally, we utilize multiple metrics to assess the accuracy, diversity, and semantic precision of the DivSLT task. 
Experimental results on the enriched datasets demonstrate that our DivSLT method achieves not only better translation performance but also diverse translation results. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.19586v1-abstract-full').style.display = 'none'; document.getElementById('2410.19586v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.19488">arXiv:2410.19488</a> <span> [<a href="https://arxiv.org/pdf/2410.19488">pdf</a>, <a href="https://arxiv.org/format/2410.19488">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> MM-WLAuslan: Multi-View Multi-Modal Word-Level Australian Sign Language Recognition Dataset </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/?searchtype=author&query=Shen%2C+X">Xin Shen</a>, <a href="/search/?searchtype=author&query=Du%2C+H">Heming Du</a>, <a href="/search/?searchtype=author&query=Sheng%2C+H">Hongwei Sheng</a>, <a href="/search/?searchtype=author&query=Wang%2C+S">Shuyun Wang</a>, <a href="/search/?searchtype=author&query=Chen%2C+H">Hui Chen</a>, <a href="/search/?searchtype=author&query=Chen%2C+H">Huiqiang Chen</a>, <a href="/search/?searchtype=author&query=Wu%2C+Z">Zhuojie Wu</a>, <a href="/search/?searchtype=author&query=Du%2C+X">Xiaobiao Du</a>, <a href="/search/?searchtype=author&query=Ying%2C+J">Jiaying Ying</a>, <a href="/search/?searchtype=author&query=Lu%2C+R">Ruihan Lu</a>, <a 
href="/search/?searchtype=author&query=Xu%2C+Q">Qingzheng Xu</a>, <a href="/search/?searchtype=author&query=Yu%2C+X">Xin Yu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.19488v1-abstract-short" style="display: inline;"> Isolated Sign Language Recognition (ISLR) focuses on identifying individual sign language glosses. Considering the diversity of sign languages across geographical regions, developing region-specific ISLR datasets is crucial for supporting communication and research. Auslan, as a sign language specific to Australia, still lacks a dedicated large-scale word-level dataset for the ISLR task. To fill t… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.19488v1-abstract-full').style.display = 'inline'; document.getElementById('2410.19488v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.19488v1-abstract-full" style="display: none;"> Isolated Sign Language Recognition (ISLR) focuses on identifying individual sign language glosses. Considering the diversity of sign languages across geographical regions, developing region-specific ISLR datasets is crucial for supporting communication and research. Auslan, as a sign language specific to Australia, still lacks a dedicated large-scale word-level dataset for the ISLR task. To fill this gap, we curate the first large-scale Multi-view Multi-modal Word-Level Australian Sign Language recognition dataset, dubbed MM-WLAuslan. Compared to other publicly available datasets, MM-WLAuslan exhibits three significant advantages: (1) the largest amount of data, (2) the most extensive vocabulary, and (3) the most diverse of multi-modal camera views. 
Specifically, we record 282K+ sign videos covering 3,215 commonly used Auslan glosses presented by 73 signers in a studio environment. Moreover, our filming system includes two different types of cameras, i.e., three Kinect-V2 cameras and a RealSense camera. We position cameras hemispherically around the front half of the model and simultaneously record videos using all four cameras. Furthermore, we benchmark results with state-of-the-art methods for various multi-modal ISLR settings on MM-WLAuslan, including multi-view, cross-camera, and cross-view. Experiment results indicate that MM-WLAuslan is a challenging ISLR dataset, and we hope this dataset will contribute to the development of Auslan and the advancement of sign languages worldwide. All datasets and benchmarks are available at MM-WLAuslan. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.19488v1-abstract-full').style.display = 'none'; document.getElementById('2410.19488v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.17337">arXiv:2410.17337</a> <span> [<a href="https://arxiv.org/pdf/2410.17337">pdf</a>, <a href="https://arxiv.org/format/2410.17337">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> </div> </div> <p class="title is-5 mathjax"> Captions Speak Louder than Images (CASLIE): Generalizing Foundation Models for E-commerce from High-quality Multimodal Instruction Data </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/?searchtype=author&query=Ling%2C+X">Xinyi Ling</a>, <a href="/search/?searchtype=author&query=Peng%2C+B">Bo Peng</a>, <a href="/search/?searchtype=author&query=Du%2C+H">Hanwen Du</a>, <a href="/search/?searchtype=author&query=Zhu%2C+Z">Zhihui Zhu</a>, <a href="/search/?searchtype=author&query=Ning%2C+X">Xia Ning</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.17337v1-abstract-short" style="display: inline;"> Leveraging multimodal data to drive breakthroughs in e-commerce applications through Multimodal Foundation Models (MFMs) is gaining increasing attention from the research community. 
However, there are significant challenges that hinder the optimal use of multimodal e-commerce data by foundation models: (1) the scarcity of large-scale, high-quality multimodal benchmark datasets; and (2) the lack of… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.17337v1-abstract-full').style.display = 'inline'; document.getElementById('2410.17337v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.17337v1-abstract-full" style="display: none;"> Leveraging multimodal data to drive breakthroughs in e-commerce applications through Multimodal Foundation Models (MFMs) is gaining increasing attention from the research community. However, there are significant challenges that hinder the optimal use of multimodal e-commerce data by foundation models: (1) the scarcity of large-scale, high-quality multimodal benchmark datasets; and (2) the lack of effective multimodal information integration methods. To address these challenges, in this paper, we introduce MMECInstruct, the first-ever, large-scale, and high-quality multimodal instruction dataset for e-commerce. We also develop CASLIE, a simple, lightweight, yet effective framework for integrating multimodal information for e-commerce. Leveraging MMECInstruct, we fine-tune a series of e-commerce MFMs within CASLIE, denoted as CASLIE models. Our comprehensive evaluation demonstrates that CASLIE models substantially outperform 5 categories of advanced baseline models in the in-domain evaluation. Moreover, CASLIE models show strong generalizability to out-of-domain settings. MMECInstruct and CASLIE models are publicly accessible through https://ninglab.github.io/CASLIE/. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.17337v1-abstract-full').style.display = 'none'; document.getElementById('2410.17337v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Xinyi Ling and Bo Peng contributed equally to this paper</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.16669">arXiv:2410.16669</a> <span> [<a href="https://arxiv.org/pdf/2410.16669">pdf</a>, <a href="https://arxiv.org/format/2410.16669">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Optimization and Control">math.OC</span> </div> </div> <p class="title is-5 mathjax"> Linear Partial Gromov-Wasserstein Embedding </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/?searchtype=author&query=Bai%2C+Y">Yikun Bai</a>, <a href="/search/?searchtype=author&query=Kothapalli%2C+A">Abihith Kothapalli</a>, <a href="/search/?searchtype=author&query=Du%2C+H">Hengrong Du</a>, <a href="/search/?searchtype=author&query=Martin%2C+R+D">Rocio Diaz Martin</a>, <a href="/search/?searchtype=author&query=Kolouri%2C+S">Soheil Kolouri</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.16669v2-abstract-short" style="display: inline;"> The Gromov Wasserstein 
(GW) problem, a variant of the classical optimal transport (OT) problem, has attracted growing interest in the machine learning and data science communities due to its ability to quantify similarity between measures in different metric spaces. However, like the classical OT problem, GW imposes an equal mass constraint between measures, which restricts its application in many… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.16669v2-abstract-full').style.display = 'inline'; document.getElementById('2410.16669v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.16669v2-abstract-full" style="display: none;"> The Gromov Wasserstein (GW) problem, a variant of the classical optimal transport (OT) problem, has attracted growing interest in the machine learning and data science communities due to its ability to quantify similarity between measures in different metric spaces. However, like the classical OT problem, GW imposes an equal mass constraint between measures, which restricts its application in many machine learning tasks. To address this limitation, the partial Gromov-Wasserstein (PGW) problem has been introduced, which relaxes the equal mass constraint, enabling the comparison of general positive Radon measures. Despite this, both GW and PGW face significant computational challenges due to their non-convex nature. To overcome these challenges, we propose the linear partial Gromov-Wasserstein (LPGW) embedding, a linearized embedding technique for the PGW problem. For $K$ different metric measure spaces, the pairwise computation of the PGW distance requires solving the PGW problem $\mathcal{O}(K^2)$ times. In contrast, the proposed linearization technique reduces this to $\mathcal{O}(K)$ times. Similar to the linearization technique for the classical OT problem, we prove that LPGW defines a valid metric for metric measure spaces. 
Finally, we demonstrate the effectiveness of LPGW in practical applications such as shape retrieval and learning with transport-based embeddings, showing that LPGW preserves the advantages of PGW in partial matching while significantly enhancing computational efficiency. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.16669v2-abstract-full').style.display = 'none'; document.getElementById('2410.16669v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 21 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.13125">arXiv:2410.13125</a> <span> [<a href="https://arxiv.org/pdf/2410.13125">pdf</a>, <a href="https://arxiv.org/format/2410.13125">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> </div> </div> <p class="title is-5 mathjax"> Transformers4NewsRec: A Transformer-based News Recommendation Framework </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/?searchtype=author&query=Liu%2C+D">Dairui Liu</a>, <a href="/search/?searchtype=author&query=Du%2C+H">Honghui Du</a>, <a href="/search/?searchtype=author&query=Yang%2C+B">Boming Yang</a>, <a href="/search/?searchtype=author&query=Hurley%2C+N">Neil Hurley</a>, <a href="/search/?searchtype=author&query=Lawlor%2C+A">Aonghus Lawlor</a>, <a href="/search/?searchtype=author&query=Li%2C+I">Irene Li</a>, <a href="/search/?searchtype=author&query=Greene%2C+D">Derek Greene</a>, <a href="/search/?searchtype=author&query=Dong%2C+R">Ruihai 
Dong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.13125v1-abstract-short" style="display: inline;"> Pre-trained transformer models have shown great promise in various natural language processing tasks, including personalized news recommendations. To harness the power of these models, we introduce Transformers4NewsRec, a new Python framework built on the Transformers library. This framework is designed to unify and compare the performance of various news recommendation models, including… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.13125v1-abstract-full').style.display = 'inline'; document.getElementById('2410.13125v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.13125v1-abstract-full" style="display: none;"> Pre-trained transformer models have shown great promise in various natural language processing tasks, including personalized news recommendations. To harness the power of these models, we introduce Transformers4NewsRec, a new Python framework built on the Transformers library. This framework is designed to unify and compare the performance of various news recommendation models, including deep neural networks and graph-based models. Transformers4NewsRec offers flexibility in terms of model selection, data preprocessing, and evaluation, allowing both quantitative and qualitative analysis. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.13125v1-abstract-full').style.display = 'none'; document.getElementById('2410.13125v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.12359">arXiv:2410.12359</a> <span> [<a href="https://arxiv.org/pdf/2410.12359">pdf</a>, <a href="https://arxiv.org/format/2410.12359">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> ERVQ: Enhanced Residual Vector Quantization with Intra-and-Inter-Codebook Optimization for Neural Audio Codecs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/?searchtype=author&query=Zheng%2C+R">Rui-Chen Zheng</a>, <a href="/search/?searchtype=author&query=Du%2C+H">Hui-Peng Du</a>, <a href="/search/?searchtype=author&query=Jiang%2C+X">Xiao-Hang Jiang</a>, <a href="/search/?searchtype=author&query=Ai%2C+Y">Yang Ai</a>, <a href="/search/?searchtype=author&query=Ling%2C+Z">Zhen-Hua Ling</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.12359v1-abstract-short" style="display: inline;"> Current neural audio codecs typically use residual vector quantization (RVQ) to discretize speech signals. However, they often experience codebook collapse, which reduces the effective codebook size and leads to suboptimal performance. 
To address this problem, we introduce ERVQ, Enhanced Residual Vector Quantization, a novel enhancement strategy for the RVQ framework in neural audio codecs. ERVQ m… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.12359v1-abstract-full').style.display = 'inline'; document.getElementById('2410.12359v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.12359v1-abstract-full" style="display: none;"> Current neural audio codecs typically use residual vector quantization (RVQ) to discretize speech signals. However, they often experience codebook collapse, which reduces the effective codebook size and leads to suboptimal performance. To address this problem, we introduce ERVQ, Enhanced Residual Vector Quantization, a novel enhancement strategy for the RVQ framework in neural audio codecs. ERVQ mitigates codebook collapse and boosts codec performance through both intra- and inter-codebook optimization. Intra-codebook optimization incorporates an online clustering strategy and a code balancing loss to ensure balanced and efficient codebook utilization. Inter-codebook optimization improves the diversity of quantized features by minimizing the similarity between successive quantizations. Our experiments show that ERVQ significantly enhances audio codec performance across different models, sampling rates, and bitrates, achieving superior quality and generalization capabilities. It also achieves 100% codebook utilization on one of the most advanced neural audio codecs. Further experiments indicate that audio codecs improved by the ERVQ strategy can improve unified speech-and-text large language models (LLMs). Specifically, there is a notable improvement in the naturalness of generated speech in downstream zero-shot text-to-speech tasks. Audio samples are available here. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.12359v1-abstract-full').style.display = 'none'; document.getElementById('2410.12359v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.11582">arXiv:2410.11582</a> <span> [<a href="https://arxiv.org/pdf/2410.11582">pdf</a>, <a href="https://arxiv.org/format/2410.11582">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> </div> </div> <p class="title is-5 mathjax"> On-the-fly Modulation for Balanced Multimodal Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/?searchtype=author&query=Wei%2C+Y">Yake Wei</a>, <a href="/search/?searchtype=author&query=Hu%2C+D">Di Hu</a>, <a href="/search/?searchtype=author&query=Du%2C+H">Henghui Du</a>, <a href="/search/?searchtype=author&query=Wen%2C+J">Ji-Rong Wen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.11582v1-abstract-short" style="display: inline;"> Multimodal learning is expected to boost model performance by integrating information from different modalities. 
However, its potential is not fully exploited because the widely-used joint training strategy, which has a uniform objective for all modalities, leads to imbalanced and under-optimized uni-modal representations. Specifically, we point out that there often exists modality with more discr… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.11582v1-abstract-full').style.display = 'inline'; document.getElementById('2410.11582v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.11582v1-abstract-full" style="display: none;"> Multimodal learning is expected to boost model performance by integrating information from different modalities. However, its potential is not fully exploited because the widely-used joint training strategy, which has a uniform objective for all modalities, leads to imbalanced and under-optimized uni-modal representations. Specifically, we point out that there often exists modality with more discriminative information, e.g., vision of playing football and sound of blowing wind. They could dominate the joint training process, resulting in other modalities being significantly under-optimized. To alleviate this problem, we first analyze the under-optimized phenomenon from both the feed-forward and the back-propagation stages during optimization. Then, On-the-fly Prediction Modulation (OPM) and On-the-fly Gradient Modulation (OGM) strategies are proposed to modulate the optimization of each modality, by monitoring the discriminative discrepancy between modalities during training. Concretely, OPM weakens the influence of the dominant modality by dropping its feature with dynamical probability in the feed-forward stage, while OGM mitigates its gradient in the back-propagation stage. In experiments, our methods demonstrate considerable improvement across a variety of multimodal tasks. 
These simple yet effective strategies not only enhance performance in vanilla and task-oriented multimodal models, but also in more complex multimodal tasks, showcasing their effectiveness and flexibility. The source code is available at https://github.com/GeWu-Lab/BML_TPAMI2024. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.11582v1-abstract-full').style.display = 'none'; document.getElementById('2410.11582v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by T-PAMI 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.09580">arXiv:2410.09580</a> <span> [<a href="https://arxiv.org/pdf/2410.09580">pdf</a>, <a href="https://arxiv.org/format/2410.09580">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> SAPIENT: Mastering Multi-turn Conversational Recommendation with Strategic Planning and Monte Carlo Tree Search </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/?searchtype=author&query=Du%2C+H">Hanwen Du</a>, <a href="/search/?searchtype=author&query=Peng%2C+B">Bo Peng</a>, <a href="/search/?searchtype=author&query=Ning%2C+X">Xia Ning</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.09580v1-abstract-short" 
style="display: inline;"> Conversational Recommender Systems (CRS) proactively engage users in interactive dialogues to elicit user preferences and provide personalized recommendations. Existing methods train Reinforcement Learning (RL)-based agent with greedy action selection or sampling strategy, and may suffer from suboptimal conversational planning. To address this, we present a novel Monte Carlo Tree Search (MCTS)-bas… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.09580v1-abstract-full').style.display = 'inline'; document.getElementById('2410.09580v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.09580v1-abstract-full" style="display: none;"> Conversational Recommender Systems (CRS) proactively engage users in interactive dialogues to elicit user preferences and provide personalized recommendations. Existing methods train Reinforcement Learning (RL)-based agent with greedy action selection or sampling strategy, and may suffer from suboptimal conversational planning. To address this, we present a novel Monte Carlo Tree Search (MCTS)-based CRS framework SAPIENT. SAPIENT consists of a conversational agent (S-agent) and a conversational planner (S-planner). S-planner builds a conversational search tree with MCTS based on the initial actions proposed by S-agent to find conversation plans. The best conversation plans from S-planner are used to guide the training of S-agent, creating a self-training loop where S-agent can iteratively improve its capability for conversational planning. Furthermore, we propose an efficient variant SAPIENT-e for trade-off between training efficiency and performance. Extensive experiments on four benchmark datasets validate the effectiveness of our approach, showing that SAPIENT outperforms the state-of-the-art baselines. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.09580v1-abstract-full').style.display = 'none'; document.getElementById('2410.09580v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.07739">arXiv:2410.07739</a> <span> [<a href="https://arxiv.org/pdf/2410.07739">pdf</a>, <a href="https://arxiv.org/format/2410.07739">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> SLIM: Let LLM Learn More and Forget Less with Soft LoRA and Identity Mixture </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/?searchtype=author&query=Han%2C+J">Jiayi Han</a>, <a href="/search/?searchtype=author&query=Du%2C+L">Liang Du</a>, <a href="/search/?searchtype=author&query=Du%2C+H">Hongwei Du</a>, <a href="/search/?searchtype=author&query=Zhou%2C+X">Xiangguo Zhou</a>, <a href="/search/?searchtype=author&query=Wu%2C+Y">Yiwen Wu</a>, <a href="/search/?searchtype=author&query=Zheng%2C+W">Weibo Zheng</a>, <a href="/search/?searchtype=author&query=Han%2C+D">Donghong Han</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.07739v1-abstract-short" style="display: inline;"> Although many efforts have been made, it is still a challenge to balance the training budget, downstream 
performance, and the general capabilities of the LLMs in many applications. Training the whole model for downstream tasks is expensive, and could easily result in catastrophic forgetting. By introducing parameter-efficient fine-tuning (PEFT), the training cost could be reduced, but it still suf… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.07739v1-abstract-full').style.display = 'inline'; document.getElementById('2410.07739v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.07739v1-abstract-full" style="display: none;"> Although many efforts have been made, it is still a challenge to balance the training budget, downstream performance, and the general capabilities of the LLMs in many applications. Training the whole model for downstream tasks is expensive, and could easily result in catastrophic forgetting. By introducing parameter-efficient fine-tuning (PEFT), the training cost could be reduced, but it still suffers from forgetting, and limits the learning on the downstream tasks. To efficiently fine-tune the LLMs with less limitation to their downstream performance while mitigating the forgetting of general capabilities, we propose a novel mixture-of-experts (MoE) framework based on Soft LoRA and Identity Mixture (SLIM), which allows dynamic routing between LoRA adapters and skip connections and enables the suppression of forgetting. We adopt weight-yielding with sliding clustering for better out-of-domain distinction to enhance the routing. We also propose to convert the mixture of low-rank adapters to the model merging formulation and introduce fast dynamic merging of LoRA adapters to keep the general capabilities of the base model. Extensive experiments demonstrate that the proposed SLIM is comparable to the state-of-the-art PEFT approaches on the downstream tasks while achieving the leading performance in mitigating catastrophic forgetting. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.07739v1-abstract-full').style.display = 'none'; document.getElementById('2410.07739v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">11 pages, 6 figures, 4 tables</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.04990">arXiv:2410.04990</a> <span> [<a href="https://arxiv.org/pdf/2410.04990">pdf</a>, <a href="https://arxiv.org/format/2410.04990">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Stage-Wise and Prior-Aware Neural Speech Phase Prediction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/?searchtype=author&query=Liu%2C+F">Fei Liu</a>, <a href="/search/?searchtype=author&query=Ai%2C+Y">Yang Ai</a>, <a href="/search/?searchtype=author&query=Du%2C+H">Hui-Peng Du</a>, <a href="/search/?searchtype=author&query=Lu%2C+Y">Ye-Xin Lu</a>, <a href="/search/?searchtype=author&query=Zheng%2C+R">Rui-Chen Zheng</a>, <a href="/search/?searchtype=author&query=Ling%2C+Z">Zhen-Hua Ling</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span 
class="abstract-short has-text-grey-dark mathjax" id="2410.04990v1-abstract-short" style="display: inline;"> This paper proposes a novel Stage-wise and Prior-aware Neural Speech Phase Prediction (SP-NSPP) model, which predicts the phase spectrum from input amplitude spectrum by two-stage neural networks. In the initial prior-construction stage, we preliminarily predict a rough prior phase spectrum from the amplitude spectrum. The subsequent refinement stage transforms the amplitude spectrum into a refine… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.04990v1-abstract-full').style.display = 'inline'; document.getElementById('2410.04990v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.04990v1-abstract-full" style="display: none;"> This paper proposes a novel Stage-wise and Prior-aware Neural Speech Phase Prediction (SP-NSPP) model, which predicts the phase spectrum from input amplitude spectrum by two-stage neural networks. In the initial prior-construction stage, we preliminarily predict a rough prior phase spectrum from the amplitude spectrum. The subsequent refinement stage transforms the amplitude spectrum into a refined high-quality phase spectrum conditioned on the prior phase. Networks in both stages use ConvNeXt v2 blocks as the backbone and adopt adversarial training by innovatively introducing a phase spectrum discriminator (PSD). To further improve the continuity of the refined phase, we also incorporate a time-frequency integrated difference (TFID) loss in the refinement stage. Experimental results confirm that, compared to neural network-based no-prior phase prediction methods, the proposed SP-NSPP achieves higher phase prediction accuracy, thanks to introducing the coarse phase priors and diverse training criteria. 
Compared to iterative phase estimation algorithms, our proposed SP-NSPP does not require multiple rounds of staged iterations, resulting in higher generation efficiency. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.04990v1-abstract-full').style.display = 'none'; document.getElementById('2410.04990v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by SLT2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.02238">arXiv:2410.02238</a> <span> [<a href="https://arxiv.org/pdf/2410.02238">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Materials Science">cond-mat.mtrl-sci</span> </div> </div> <p class="title is-5 mathjax"> Orbital torque switching of perpendicular magnetization in light metal/ferrimagnet bilayers </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/?searchtype=author&query=Xu%2C+T">Teng Xu</a>, <a href="/search/?searchtype=author&query=Tang%2C+A">Aihua Tang</a>, <a href="/search/?searchtype=author&query=Wang%2C+K">Kang Wang</a>, <a href="/search/?searchtype=author&query=Liu%2C+Y">Yizhou Liu</a>, <a href="/search/?searchtype=author&query=Du%2C+H">Haifeng Du</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.02238v1-abstract-short" style="display: inline;"> Orbital torque, associated with orbital 
current, enables light metals to efficiently manipulate magnetization with rich tunability. A clear demonstration of perpendicular magnetization switching using light metals alone is essential for understanding orbital physics and developing high-density orbitronic devices. Here, we report orbital torque switching of perpendicular magnetization in light meta… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.02238v1-abstract-full').style.display = 'inline'; document.getElementById('2410.02238v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.02238v1-abstract-full" style="display: none;"> Orbital torque, associated with orbital current, enables light metals to efficiently manipulate magnetization with rich tunability. A clear demonstration of perpendicular magnetization switching using light metals alone is essential for understanding orbital physics and developing high-density orbitronic devices. Here, we report orbital torque switching of perpendicular magnetization in light metal (Ti, V, Cr)/ferrimagnet (Fe$_{1-x}$Gd$_x$) bilayers. Taking the Ti/Fe$_{1-x}$Gd$_x$ sample as a model system, the torque efficiency increases four-fold by enhancing the spin-orbit coupling in Fe$_{1-x}$Gd$_x$ through modulating Gd composition, which is a characteristic feature of orbital torque. Our findings demonstrate that light metals in combination with rare earth-transition metal ferrimagnets can be employed for efficient orbitronic devices and serve as a model system for studying orbitronics. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.02238v1-abstract-full').style.display = 'none'; document.getElementById('2410.02238v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.01240">arXiv:2410.01240</a> <span> [<a href="https://arxiv.org/pdf/2410.01240">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> </div> </div> <p class="title is-5 mathjax"> Automatic deductive coding in discourse analysis: an application of large language models in learning analytics </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/?searchtype=author&query=Zhang%2C+L">Lishan Zhang</a>, <a href="/search/?searchtype=author&query=Wu%2C+H">Han Wu</a>, <a href="/search/?searchtype=author&query=Huang%2C+X">Xiaoshan Huang</a>, <a href="/search/?searchtype=author&query=Duan%2C+T">Tengfei Duan</a>, <a href="/search/?searchtype=author&query=Du%2C+H">Hanxiang Du</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.01240v1-abstract-short" style="display: inline;"> Deductive coding is a common discourse analysis method widely used by learning science and learning analytics researchers for understanding teaching and learning interactions. 
It often requires researchers to manually label all discourses to be analyzed according to a theoretically guided coding scheme, which is time-consuming and labor-intensive. The emergence of large language models such as GPT… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.01240v1-abstract-full').style.display = 'inline'; document.getElementById('2410.01240v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.01240v1-abstract-full" style="display: none;"> Deductive coding is a common discourse analysis method widely used by learning science and learning analytics researchers for understanding teaching and learning interactions. It often requires researchers to manually label all discourses to be analyzed according to a theoretically guided coding scheme, which is time-consuming and labor-intensive. The emergence of large language models such as GPT has opened a new avenue for automatic deductive coding to overcome the limitations of traditional deductive coding. To evaluate the usefulness of large language models in automatic deductive coding, we employed three different classification methods driven by different artificial intelligence technologies, including the traditional text classification method with text feature engineering, BERT-like pretrained language model and GPT-like pretrained large language model (LLM). We applied these methods to two different datasets and explored the potential of GPT and prompt engineering in automatic deductive coding. By analyzing and comparing the accuracy and Kappa values of these three classification methods, we found that GPT with prompt engineering outperformed the other two methods on both datasets with limited number of training samples. By providing detailed prompt structures, the reported work demonstrated how large language models can be used in the implementation of automatic deductive coding. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.01240v1-abstract-full').style.display = 'none'; document.getElementById('2410.01240v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">20 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.19865">arXiv:2409.19865</a> <span> [<a href="https://arxiv.org/pdf/2409.19865">pdf</a>, <a href="https://arxiv.org/format/2409.19865">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> TokenBinder: Text-Video Retrieval with One-to-Many Alignment Paradigm </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/?searchtype=author&query=Zhang%2C+B">Bingqing Zhang</a>, <a href="/search/?searchtype=author&query=Cao%2C+Z">Zhuo Cao</a>, <a href="/search/?searchtype=author&query=Du%2C+H">Heming Du</a>, <a href="/search/?searchtype=author&query=Yu%2C+X">Xin Yu</a>, <a href="/search/?searchtype=author&query=Li%2C+X">Xue Li</a>, <a href="/search/?searchtype=author&query=Liu%2C+J">Jiajun Liu</a>, <a href="/search/?searchtype=author&query=Wang%2C+S">Sen Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.19865v1-abstract-short" style="display: inline;"> Text-Video Retrieval (TVR) methods typically 
match query-candidate pairs by aligning text and video features in coarse-grained, fine-grained, or combined (coarse-to-fine) manners. However, these frameworks predominantly employ a one(query)-to-one(candidate) alignment paradigm, which struggles to discern nuanced differences among candidates, leading to frequent mismatches. Inspired by Comparative J… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.19865v1-abstract-full').style.display = 'inline'; document.getElementById('2409.19865v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.19865v1-abstract-full" style="display: none;"> Text-Video Retrieval (TVR) methods typically match query-candidate pairs by aligning text and video features in coarse-grained, fine-grained, or combined (coarse-to-fine) manners. However, these frameworks predominantly employ a one(query)-to-one(candidate) alignment paradigm, which struggles to discern nuanced differences among candidates, leading to frequent mismatches. Inspired by Comparative Judgement in human cognitive science, where decisions are made by directly comparing items rather than evaluating them independently, we propose TokenBinder. This innovative two-stage TVR framework introduces a novel one-to-many coarse-to-fine alignment paradigm, imitating the human cognitive process of identifying specific items within a large collection. Our method employs a Focused-view Fusion Network with a sophisticated cross-attention mechanism, dynamically aligning and comparing features across multiple videos to capture finer nuances and contextual variations. Extensive experiments on six benchmark datasets confirm that TokenBinder substantially outperforms existing state-of-the-art methods. These results demonstrate its robustness and the effectiveness of its fine-grained alignment in bridging intra- and inter-modality information gaps in TVR tasks. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.19865v1-abstract-full').style.display = 'none'; document.getElementById('2409.19865v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.15894">arXiv:2409.15894</a> <span> [<a href="https://arxiv.org/pdf/2409.15894">pdf</a>, <a href="https://arxiv.org/ps/2409.15894">ps</a>, <a href="https://arxiv.org/format/2409.15894">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Robust Beamforming Design for Near-Field DMA-NOMA mmWave Communications With Imperfect Position Information </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/?searchtype=author&query=Xiu%2C+Y">Yue Xiu</a>, <a href="/search/?searchtype=author&query=Zhao%2C+Y">Yang Zhao</a>, <a href="/search/?searchtype=author&query=Yang%2C+S">Songjie Yang</a>, <a href="/search/?searchtype=author&query=Zhang%2C+Y">Yufeng Zhang</a>, <a href="/search/?searchtype=author&query=Niyato%2C+D">Dusit Niyato</a>, <a href="/search/?searchtype=author&query=Du%2C+H">Hongyang Du</a>, <a href="/search/?searchtype=author&query=Wei%2C+N">Ning Wei</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.15894v1-abstract-short" style="display: inline;"> For millimeter-wave (mmWave) non-orthogonal multiple access (NOMA) communication systems, we propose an innovative near-field 
(NF) transmission framework based on dynamic metasurface antenna (DMA) technology. In this framework, a base station (BS) utilizes the DMA hybrid beamforming technology combined with the NOMA principle to maximize communication efficiency between near-field users (NUs) and… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.15894v1-abstract-full').style.display = 'inline'; document.getElementById('2409.15894v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.15894v1-abstract-full" style="display: none;"> For millimeter-wave (mmWave) non-orthogonal multiple access (NOMA) communication systems, we propose an innovative near-field (NF) transmission framework based on dynamic metasurface antenna (DMA) technology. In this framework, a base station (BS) utilizes the DMA hybrid beamforming technology combined with the NOMA principle to maximize communication efficiency between near-field users (NUs) and far-field users (FUs). In conventional communication systems, obtaining channel state information (CSI) requires substantial pilot signals, significantly reducing system communication efficiency. We propose a beamforming design scheme based on position information to address this challenge. This scheme does not depend on pilot signals but indirectly obtains CSI by analyzing the geometric relationship between user position information and channel models. However, in practical applications, the accuracy of position information is challenging to guarantee and may contain errors. We propose a robust beamforming design strategy based on the worst-case scenario to tackle this issue. Faced with the multi-variable coupled non-convex problems, we employ a dual-loop iterative joint optimization algorithm to update beamforming using block coordinate descent (BCD) and derive the optimal power allocation (PA) expression. 
We analyze its convergence and complexity to verify the proposed algorithm's performance and robustness thoroughly. We validate the theoretical derivation of the CSI error bound through simulation experiments. Numerical results show that our proposed scheme performs better than traditional beamforming schemes. Additionally, the transmission framework exhibits strong robustness to NU and FU position errors, laying a solid foundation for the practical application of mmWave NOMA communication systems. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.15894v1-abstract-full').style.display = 'none'; document.getElementById('2409.15894v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.15750">arXiv:2409.15750</a> <span> [<a href="https://arxiv.org/pdf/2409.15750">pdf</a>, <a href="https://arxiv.org/format/2409.15750">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Emerging Technologies">cs.ET</span> </div> </div> <p class="title is-5 mathjax"> The Roles of Generative Artificial Intelligence in Internet of Electric Vehicles </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/?searchtype=author&query=Zhang%2C+H">Hanwen Zhang</a>, <a href="/search/?searchtype=author&query=Niyato%2C+D">Dusit Niyato</a>, <a 
href="/search/?searchtype=author&query=Zhang%2C+W">Wei Zhang</a>, <a href="/search/?searchtype=author&query=Zhao%2C+C">Changyuan Zhao</a>, <a href="/search/?searchtype=author&query=Du%2C+H">Hongyang Du</a>, <a href="/search/?searchtype=author&query=Jamalipour%2C+A">Abbas Jamalipour</a>, <a href="/search/?searchtype=author&query=Sun%2C+S">Sumei Sun</a>, <a href="/search/?searchtype=author&query=Pei%2C+Y">Yiyang Pei</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.15750v3-abstract-short" style="display: inline;"> With the advancements of generative artificial intelligence (GenAI) models, their capabilities are expanding significantly beyond content generation and the models are increasingly being used across diverse applications. Particularly, GenAI shows great potential in addressing challenges in the electric vehicle (EV) ecosystem ranging from charging management to cyber-attack prevention. In this pape… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.15750v3-abstract-full').style.display = 'inline'; document.getElementById('2409.15750v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.15750v3-abstract-full" style="display: none;"> With the advancements of generative artificial intelligence (GenAI) models, their capabilities are expanding significantly beyond content generation and the models are increasingly being used across diverse applications. Particularly, GenAI shows great potential in addressing challenges in the electric vehicle (EV) ecosystem ranging from charging management to cyber-attack prevention. 
In this paper, we specifically consider Internet of electric vehicles (IoEV) and we categorize GenAI for IoEV into four different layers namely, EV's battery layer, individual EV layer, smart grid layer, and security layer. We introduce various GenAI techniques used in each layer of IoEV applications. Subsequently, public datasets available for training the GenAI models are summarized. Finally, we provide recommendations for future directions. This survey not only categorizes the applications of GenAI in IoEV across different layers but also serves as a valuable resource for researchers and practitioners by highlighting the design and implementation challenges within each layer. Furthermore, it provides a roadmap for future research directions, enabling the development of more robust and efficient IoEV systems through the integration of advanced GenAI techniques. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.15750v3-abstract-full').style.display = 'none'; document.getElementById('2409.15750v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 24 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">25 Pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.15695">arXiv:2409.15695</a> <span> [<a href="https://arxiv.org/pdf/2409.15695">pdf</a>, <a href="https://arxiv.org/format/2409.15695">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Networking and Internet Architecture">cs.NI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> </div> </div> <p class="title is-5 mathjax"> Toward Mixture-of-Experts Enabled Trustworthy Semantic Communication for 6G Networks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/?searchtype=author&query=He%2C+J">Jiayi He</a>, <a href="/search/?searchtype=author&query=Luo%2C+X">Xiaofeng Luo</a>, <a href="/search/?searchtype=author&query=Kang%2C+J">Jiawen Kang</a>, <a href="/search/?searchtype=author&query=Du%2C+H">Hongyang Du</a>, <a href="/search/?searchtype=author&query=Xiong%2C+Z">Zehui Xiong</a>, <a href="/search/?searchtype=author&query=Chen%2C+C">Ci Chen</a>, <a href="/search/?searchtype=author&query=Niyato%2C+D">Dusit Niyato</a>, <a href="/search/?searchtype=author&query=Shen%2C+X">Xuemin Shen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.15695v1-abstract-short" style="display: inline;"> Semantic Communication (SemCom) plays a pivotal role in 6G networks, offering a viable solution for future efficient communication. 
Deep Learning (DL)-based semantic codecs further enhance this efficiency. However, the vulnerability of DL models to security threats, such as adversarial attacks, poses significant challenges for practical applications of SemCom systems. These vulnerabilities enable… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.15695v1-abstract-full').style.display = 'inline'; document.getElementById('2409.15695v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.15695v1-abstract-full" style="display: none;"> Semantic Communication (SemCom) plays a pivotal role in 6G networks, offering a viable solution for future efficient communication. Deep Learning (DL)-based semantic codecs further enhance this efficiency. However, the vulnerability of DL models to security threats, such as adversarial attacks, poses significant challenges for practical applications of SemCom systems. These vulnerabilities enable attackers to tamper with messages and eavesdrop on private information, especially in wireless communication scenarios. Although existing defenses attempt to address specific threats, they often fail to simultaneously handle multiple heterogeneous attacks. To overcome this limitation, we introduce a novel Mixture-of-Experts (MoE)-based SemCom system. This system comprises a gating network and multiple experts, each specializing in different security challenges. The gating network adaptively selects suitable experts to counter heterogeneous attacks based on user-defined security requirements. Multiple experts collaborate to accomplish semantic communication tasks while meeting the security requirements of users. A case study in vehicular networks demonstrates the efficacy of the MoE-based SemCom system. 
Simulation results show that the proposed MoE-based SemCom system effectively mitigates concurrent heterogeneous attacks, with minimal impact on downstream task accuracy. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.15695v1-abstract-full').style.display = 'none'; document.getElementById('2409.15695v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">8 pages, 3 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.14702">arXiv:2409.14702</a> <span> [<a href="https://arxiv.org/pdf/2409.14702">pdf</a>, <a href="https://arxiv.org/ps/2409.14702">ps</a>, <a href="https://arxiv.org/format/2409.14702">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Rate-Splitting for Cell-Free Massive MIMO: Performance Analysis and Generative AI Approach </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/?searchtype=author&query=Zheng%2C+J">Jiakang Zheng</a>, <a href="/search/?searchtype=author&query=Zhang%2C+J">Jiayi Zhang</a>, <a href="/search/?searchtype=author&query=Du%2C+H">Hongyang Du</a>, <a href="/search/?searchtype=author&query=Zhang%2C+R">Ruichen Zhang</a>, <a href="/search/?searchtype=author&query=Niyato%2C+D">Dusit Niyato</a>, <a 
href="/search/?searchtype=author&query=Dobre%2C+O+A">Octavia A. Dobre</a>, <a href="/search/?searchtype=author&query=Ai%2C+B">Bo Ai</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.14702v2-abstract-short" style="display: inline;"> Cell-free (CF) massive multiple-input multiple-output (MIMO) provides a ubiquitous coverage to user equipments (UEs) but it is also susceptible to interference. Rate-splitting (RS) effectively extracts data by decoding interference, yet its effectiveness is limited by the weakest UE. In this paper, we investigate an RS-based CF massive MIMO system, which combines strengths and mitigates weaknesses o… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.14702v2-abstract-full').style.display = 'inline'; document.getElementById('2409.14702v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.14702v2-abstract-full" style="display: none;"> Cell-free (CF) massive multiple-input multiple-output (MIMO) provides a ubiquitous coverage to user equipments (UEs) but it is also susceptible to interference. Rate-splitting (RS) effectively extracts data by decoding interference, yet its effectiveness is limited by the weakest UE. In this paper, we investigate an RS-based CF massive MIMO system, which combines strengths and mitigates weaknesses of both approaches. Considering imperfect channel state information (CSI) resulting from both pilot contamination and noise, we derive a closed-form expression for the sum spectral efficiency (SE) of the RS-based CF massive MIMO system under a spatially correlated Rician channel. 
Moreover, we propose low-complexity heuristic algorithms based on statistical CSI for power-splitting of common messages and power-control of private messages, and genetic algorithm is adopted as a solution for upper bound performance. Furthermore, we formulate a joint optimization problem, aiming to maximize the sum SE of the RS-based CF massive MIMO system by optimizing the power-splitting factor and power-control coefficient. Importantly, we improve a generative AI (GAI) algorithm to address this complex and nonconvexity problem by using a diffusion model to obtain solutions. Simulation results demonstrate its effectiveness and practicality in mitigating interference, especially in dynamic environments. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.14702v2-abstract-full').style.display = 'none'; document.getElementById('2409.14702v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 23 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">15 pages, 9 figures, Accepted in IEEE Transactions on Communications</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.14031">arXiv:2409.14031</a> <span> [<a href="https://arxiv.org/pdf/2409.14031">pdf</a>, <a href="https://arxiv.org/format/2409.14031">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Signal Detection in Near-field Communication with Unknown Noise Characteristics: A Diffusion Model Method </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/?searchtype=author&query=Zhao%2C+C">Changyuan Zhao</a>, <a href="/search/?searchtype=author&query=Wang%2C+J">Jiacheng Wang</a>, <a href="/search/?searchtype=author&query=Zhang%2C+R">Ruichen Zhang</a>, <a href="/search/?searchtype=author&query=Niyato%2C+D">Dusit Niyato</a>, <a href="/search/?searchtype=author&query=Kim%2C+D+I">Dong In Kim</a>, <a href="/search/?searchtype=author&query=Du%2C+H">Hongyang Du</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.14031v1-abstract-short" style="display: inline;"> In this letter, we present a diffusion model method for signal detection in near-field communication with unknown noise characteristics. We consider an uplink transmission of a near-field MIMO communication system consisting of multiple mobile terminals and one base station with multiple antennas. 
Then, we proposed a Maximum Likelihood Estimation Diffusion Detector (MLEDD) aiming at learning the d… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.14031v1-abstract-full').style.display = 'inline'; document.getElementById('2409.14031v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.14031v1-abstract-full" style="display: none;"> In this letter, we present a diffusion model method for signal detection in near-field communication with unknown noise characteristics. We consider an uplink transmission of a near-field MIMO communication system consisting of multiple mobile terminals and one base station with multiple antennas. Then, we proposed a Maximum Likelihood Estimation Diffusion Detector (MLEDD) aiming at learning the distribution of unknown noise. To this end, we define an error function via Bayes' theorem to detect the source signal. Moreover, we present an implementation of the proposed framework. The performance of the proposed method in terms of bit error rate shows that it outperforms the MLE detector, Detection Network (DetNet), and Maximum Normalizing Flow Estimate method (MANFE) across different signal-to-noise ratios and noise distributions. Especially when the noise distribution is intractable, diffusion, as a state-of-the-art probability model, has the best distribution learning ability compared to other models. These results affirm that this framework can effectively detect signals in near-field scenarios. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.14031v1-abstract-full').style.display = 'none'; document.getElementById('2409.14031v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">5 pages, 3 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.11532">arXiv:2409.11532</a> <span> [<a href="https://arxiv.org/pdf/2409.11532">pdf</a>, <a href="https://arxiv.org/format/2409.11532">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> Enhancing the Reliability of LiDAR Point Cloud Sampling: A Colorization and Super-Resolution Approach Based on LiDAR-Generated Images </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/?searchtype=author&query=Ha%2C+S">Sier Ha</a>, <a href="/search/?searchtype=author&query=Du%2C+H">Honghao Du</a>, <a href="/search/?searchtype=author&query=Yu%2C+X">Xianjia Yu</a>, <a href="/search/?searchtype=author&query=Song%2C+J">Jian Song</a>, <a href="/search/?searchtype=author&query=Westerlund%2C+T">Tomi Westerlund</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.11532v1-abstract-short" style="display: inline;"> In recent years, Light Detection and Ranging (LiDAR) technology, a critical sensor in robotics and autonomous systems, 
has seen significant advancements. These improvements include enhanced resolution of point clouds and the capability to provide 360° low-resolution images. These images encode various data such as depth, reflectivity, and near-infrared light within the pixels. However, an excessiv… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.11532v1-abstract-full').style.display = 'inline'; document.getElementById('2409.11532v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.11532v1-abstract-full" style="display: none;"> In recent years, Light Detection and Ranging (LiDAR) technology, a critical sensor in robotics and autonomous systems, has seen significant advancements. These improvements include enhanced resolution of point clouds and the capability to provide 360° low-resolution images. These images encode various data such as depth, reflectivity, and near-infrared light within the pixels. However, an excessive density of points and conventional point cloud sampling can be counterproductive, particularly in applications such as LiDAR odometry, where misleading points and degraded geometry information may induce drift errors. Currently, extensive research efforts are being directed towards leveraging LiDAR-generated images to improve situational awareness. This paper presents a comprehensive review of current deep learning (DL) techniques, including colorization and super-resolution, which are traditionally utilized in conventional computer vision tasks. These techniques are applied to LiDAR-generated images and are analyzed qualitatively. Based on this analysis, we have developed a novel approach that selectively integrates the most suited colorization and super-resolution methods with LiDAR imagery to sample reliable points from the LiDAR point cloud. 
This approach aims to not only improve the accuracy of point cloud registration but also avoid mismatching caused by lacking geometry information, thereby augmenting the utility and precision of LiDAR systems in practical applications. In our evaluation, the proposed approach demonstrates superior performance compared to our previous work, achieving lower translation and rotation errors with a reduced number of points. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.11532v1-abstract-full').style.display = 'none'; document.getElementById('2409.11532v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">9 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.10061">arXiv:2409.10061</a> <span> [<a href="https://arxiv.org/pdf/2409.10061">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Strongly Correlated Electrons">cond-mat.str-el</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Materials Science">cond-mat.mtrl-sci</span> </div> </div> <p class="title is-5 mathjax"> Magnetization dependent anisotropic topological properties in EuCuP </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/?searchtype=author&query=Yuan%2C+J">Jian Yuan</a>, <a href="/search/?searchtype=author&query=Shi%2C+X">Xianbiao Shi</a>, <a href="/search/?searchtype=author&query=Du%2C+H">Hong Du</a>, <a 
href="/search/?searchtype=author&query=Wang%2C+X">Xia Wang</a>, <a href="/search/?searchtype=author&query=Cheng%2C+J">Jinguang Cheng</a>, <a href="/search/?searchtype=author&query=Wang%2C+B">Baotian Wang</a>, <a href="/search/?searchtype=author&query=Zhong%2C+R">Ruidan Zhong</a>, <a href="/search/?searchtype=author&query=Zhang%2C+S">Shihao Zhang</a>, <a href="/search/?searchtype=author&query=Guo%2C+Y">Yanfeng Guo</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.10061v1-abstract-short" style="display: inline;"> The correlation between magnetism and nontrivial topological band structure serves as a unique venue for discovering exotic topological properties. Combining magnetotransport measurements and first-principles calculations, we unveil herein that the hexagonal EuCuP holds topologically trivial state in the paramagnetic structure, while strong magnetization dependent anisotropic topological states in… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.10061v1-abstract-full').style.display = 'inline'; document.getElementById('2409.10061v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.10061v1-abstract-full" style="display: none;"> The correlation between magnetism and nontrivial topological band structure serves as a unique venue for discovering exotic topological properties. Combining magnetotransport measurements and first-principles calculations, we unveil herein that the hexagonal EuCuP holds topologically trivial state in the paramagnetic structure, while strong magnetization dependent anisotropic topological states in the spin-polarization structures. 
Specifically, it hosts a trivial topological state in the in-plane spin-polarization structure, while a Weyl semimetal state in the out-of-plane spin-polarization structure. Our scaling analysis suggests that the intrinsic Berry curvature in the spin-polarization structures can account for the observed large anisotropic anomalous Hall effect. First-principles calculations show that the magnetization and the spin-orbit coupling simultaneously play essential roles for the appearance of the four pairs of Weyl points in the out-of-plane spin-polarization structure. Our work therefore establishes in EuCuP the intimate relation between magnetism and the nontrivial topological states, which would be instructive for future study on this key issue of topological physics. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.10061v1-abstract-full').style.display = 'none'; document.getElementById('2409.10061v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">15 pages, 5 figures and 2 tables</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> Physical Review Materials, 2024 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.09585">arXiv:2409.09585</a> <span> [<a href="https://arxiv.org/pdf/2409.09585">pdf</a>, <a href="https://arxiv.org/format/2409.09585">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Networking and Internet Architecture">cs.NI</span> </div> </div> <p class="title is-5 mathjax"> CSQF-based Time-Sensitive Flow Scheduling in Long-distance Industrial IoT Networks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/?searchtype=author&query=Huang%2C+Y">Yudong Huang</a>, <a href="/search/?searchtype=author&query=Huang%2C+T">Tao Huang</a>, <a href="/search/?searchtype=author&query=Zhang%2C+X">Xinyuan Zhang</a>, <a href="/search/?searchtype=author&query=Wang%2C+S">Shuo Wang</a>, <a href="/search/?searchtype=author&query=Du%2C+H">Hongyang Du</a>, <a href="/search/?searchtype=author&query=Niyato%2C+D">Dusit Niyato</a>, <a href="/search/?searchtype=author&query=Yu%2C+F+R">Fei Richard Yu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.09585v1-abstract-short" style="display: inline;"> Booming time-critical services, such as automated manufacturing and remote operations, stipulate increasing demands for facilitating large-scale Industrial Internet of Things (IoT). Recently, a cycle specified queuing and forwarding (CSQF) scheme has been advocated to enhance the Ethernet. 
However, CSQF only outlines a foundational equipment-level primitive, while how to attain network-wide flow s… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.09585v1-abstract-full').style.display = 'inline'; document.getElementById('2409.09585v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.09585v1-abstract-full" style="display: none;"> Booming time-critical services, such as automated manufacturing and remote operations, stipulate increasing demands for facilitating large-scale Industrial Internet of Things (IoT). Recently, a cycle specified queuing and forwarding (CSQF) scheme has been advocated to enhance the Ethernet. However, CSQF only outlines a foundational equipment-level primitive, while how to attain network-wide flow scheduling is not yet determined. Prior endeavors primarily focus on the range of a local area, rendering them unsuitable for long-distance factory interconnection. This paper devises the cycle tags planning (CTP) mechanism, the first integer programming model for the CSQF, which makes the CSQF practical for efficient global flow scheduling. In the CTP model, the per-hop cycle alignment problem is solved by decoupling the long-distance link delay from cyclic queuing time. To avoid queue overflows, we discretize the underlying network resources into cycle-related queue resource blocks and detail the core constraints within multiple periods. Then, two heuristic algorithms named flow offset and cycle shift (FO-CS) and Tabu FO-CS are designed to calculate the flows' cycle tags and maximize the number of schedulable flows, respectively. Evaluation results show that FO-CS increases the number of scheduled flows by 31.2%. The Tabu FO-CS algorithm can schedule 94.45% of flows at the level of 2000 flows. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.09585v1-abstract-full').style.display = 'none'; document.getElementById('2409.09585v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.09343">arXiv:2409.09343</a> <span> [<a href="https://arxiv.org/pdf/2409.09343">pdf</a>, <a href="https://arxiv.org/format/2409.09343">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Networking and Internet Architecture">cs.NI</span> </div> </div> <p class="title is-5 mathjax"> Generative AI in Data Center Networking: Fundamentals, Perspectives, and Case Study </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/?searchtype=author&query=Liu%2C+Y">Yinqiu Liu</a>, <a href="/search/?searchtype=author&query=Du%2C+H">Hongyang Du</a>, <a href="/search/?searchtype=author&query=Niyato%2C+D">Dusit Niyato</a>, <a href="/search/?searchtype=author&query=Kang%2C+J">Jiawen Kang</a>, <a href="/search/?searchtype=author&query=Xiong%2C+Z">Zehui Xiong</a>, <a href="/search/?searchtype=author&query=Wen%2C+Y">Yonggang Wen</a>, <a href="/search/?searchtype=author&query=Kim%2C+D+I">Dong In Kim</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.09343v1-abstract-short" style="display: inline;"> Generative AI (GenAI), exemplified by Large Language Models (LLMs) such as OpenAI's ChatGPT, is revolutionizing various fields. 
Central to this transformation is Data Center Networking (DCN), which not only provides the computational power necessary for GenAI training and inference but also delivers GenAI-driven services to users. This article examines an interplay between GenAI and DCNs, highligh… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.09343v1-abstract-full').style.display = 'inline'; document.getElementById('2409.09343v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.09343v1-abstract-full" style="display: none;"> Generative AI (GenAI), exemplified by Large Language Models (LLMs) such as OpenAI's ChatGPT, is revolutionizing various fields. Central to this transformation is Data Center Networking (DCN), which not only provides the computational power necessary for GenAI training and inference but also delivers GenAI-driven services to users. This article examines an interplay between GenAI and DCNs, highlighting their symbiotic relationship and mutual advancements. We begin by reviewing current challenges within DCNs and discuss how GenAI contributes to enhancing DCN capabilities through innovations, such as data augmentation, process automation, and domain transfer. We then focus on analyzing the distinctive characteristics of GenAI workloads on DCNs, gaining insights that catalyze the evolution of DCNs to more effectively support GenAI and LLMs. Moreover, to illustrate the seamless integration of GenAI with DCNs, we present a case study on full-lifecycle DCN digital twins. In this study, we employ LLMs equipped with Retrieval Augmented Generation (RAG) to formulate optimization problems for DCNs and adopt Diffusion-Deep Reinforcement Learning (DRL) for optimizing the RAG knowledge placement strategy. 
This approach not only demonstrates the application of advanced GenAI methods within DCNs but also positions the digital twin as a pivotal GenAI service operating on DCNs. We anticipate that this article can promote further research into enhancing the virtuous interaction between GenAI and DCNs. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.09343v1-abstract-full').style.display = 'none'; document.getElementById('2409.09343v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">9 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.15037">arXiv:2408.15037</a> <span> [<a href="https://arxiv.org/pdf/2408.15037">pdf</a>, <a href="https://arxiv.org/format/2408.15037">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Evidence-Enhanced Triplet Generation Framework for Hallucination Alleviation in Generative Question Answering </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/?searchtype=author&query=Du%2C+H">Haowei Du</a>, <a href="/search/?searchtype=author&query=Zhang%2C+H">Huishuai Zhang</a>, <a href="/search/?searchtype=author&query=Zhao%2C+D">Dongyan Zhao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis 
has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.15037v1-abstract-short" style="display: inline;"> To address the hallucination in generative question answering (GQA) where the answer can not be derived from the document, we propose a novel evidence-enhanced triplet generation framework, EATQA, encouraging the model to predict all the combinations of (Question, Evidence, Answer) triplet by flipping the source pair and the target label to understand their logical relationships, i.e., predict Ans… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.15037v1-abstract-full').style.display = 'inline'; document.getElementById('2408.15037v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.15037v1-abstract-full" style="display: none;"> To address the hallucination in generative question answering (GQA) where the answer can not be derived from the document, we propose a novel evidence-enhanced triplet generation framework, EATQA, encouraging the model to predict all the combinations of (Question, Evidence, Answer) triplet by flipping the source pair and the target label to understand their logical relationships, i.e., predict Answer(A), Question(Q), and Evidence(E) given a QE, EA, and QA pairs, respectively. Furthermore, we bridge the distribution gap to distill the knowledge from evidence in inference stage. Our framework ensures the model to learn the logical relation between query, evidence and answer, which simultaneously improves the evidence generation and query answering. In this paper, we apply EATQA to LLama and it outperforms other LLMs-based methods and hallucination mitigation approaches on two challenging GQA benchmarks. Further analysis shows that our method not only keeps prior knowledge within LLM, but also mitigates hallucination and generates faithful answers. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.15037v1-abstract-full').style.display = 'none'; document.getElementById('2408.15037v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.13028">arXiv:2408.13028</a> <span> [<a href="https://arxiv.org/pdf/2408.13028">pdf</a>, <a href="https://arxiv.org/format/2408.13028">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> In-Context Learning with Reinforcement Learning for Incomplete Utterance Rewriting </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/?searchtype=author&query=Du%2C+H">Haowei Du</a>, <a href="/search/?searchtype=author&query=Zhao%2C+D">Dongyan Zhao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.13028v1-abstract-short" style="display: inline;"> In-context learning (ICL) of large language models (LLMs) has attracted increasing attention in the community where LLMs make predictions only based on instructions augmented with a few examples. Existing example selection methods for ICL utilize sparse or dense retrievers and derive effective performance. 
However, these methods do not utilize direct feedback of LLM to train the retriever and the… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.13028v1-abstract-full').style.display = 'inline'; document.getElementById('2408.13028v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.13028v1-abstract-full" style="display: none;"> In-context learning (ICL) of large language models (LLMs) has attracted increasing attention in the community where LLMs make predictions only based on instructions augmented with a few examples. Existing example selection methods for ICL utilize sparse or dense retrievers and derive effective performance. However, these methods do not utilize direct feedback of LLM to train the retriever and the examples selected can not necessarily improve the analogy ability of LLM. To tackle this, we propose our policy-based reinforcement learning framework for example selection (RLS), which consists of a language model (LM) selector and an LLM generator. The LM selector encodes the candidate examples into dense representations and selects the top-k examples into the demonstration for LLM. The outputs of LLM are adopted to compute the reward and policy gradient to optimize the LM selector. We conduct experiments on different datasets and significantly outperform existing example selection methods. Moreover, our approach shows advantages over supervised finetuning (SFT) models in few shot setting. Further experiments show the balance of abundance and the similarity with the test case of examples is important for ICL performance of LLM. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.13028v1-abstract-full').style.display = 'none'; document.getElementById('2408.13028v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.12979">arXiv:2408.12979</a> <span> [<a href="https://arxiv.org/pdf/2408.12979">pdf</a>, <a href="https://arxiv.org/format/2408.12979">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Internal and External Knowledge Interactive Refinement Framework for Knowledge-Intensive Question Answering </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/?searchtype=author&query=Du%2C+H">Haowei Du</a>, <a href="/search/?searchtype=author&query=Zhao%2C+D">Dongyan Zhao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.12979v1-abstract-short" style="display: inline;"> Recent works have attempted to integrate external knowledge into LLMs to address the limitations and potential factual errors in LLM-generated content. However, how to retrieve the correct knowledge from the large amount of external knowledge imposes a challenge. 
To this end, we empirically observe that LLMs have already encoded rich knowledge in their pretrained parameters and utilizing these int… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.12979v1-abstract-full').style.display = 'inline'; document.getElementById('2408.12979v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.12979v1-abstract-full" style="display: none;"> Recent works have attempted to integrate external knowledge into LLMs to address the limitations and potential factual errors in LLM-generated content. However, how to retrieve the correct knowledge from the large amount of external knowledge imposes a challenge. To this end, we empirically observe that LLMs have already encoded rich knowledge in their pretrained parameters and utilizing these internal knowledge improves the retrieval of external knowledge when applying them to knowledge-intensive tasks. In this paper, we propose a new internal and external knowledge interactive refinement paradigm dubbed IEKR to utilize internal knowledge in LLM to help retrieve relevant knowledge from the external knowledge base, as well as exploit the external knowledge to refine the hallucination of generated internal knowledge. By simply adding a prompt like 'Tell me something about' to the LLMs, we try to review related explicit knowledge and insert them with the query into the retriever for external retrieval. The external knowledge is utilized to complement the internal knowledge into input of LLM for answers. We conduct experiments on 3 benchmark datasets in knowledge-intensive question answering task with different LLMs and domains, achieving the new state-of-the-art. Further analysis shows the effectiveness of different modules in our approach. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.12979v1-abstract-full').style.display = 'none'; document.getElementById('2408.12979v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.11398">arXiv:2408.11398</a> <span> [<a href="https://arxiv.org/pdf/2408.11398">pdf</a>, <a href="https://arxiv.org/format/2408.11398">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Generative AI based Secure Wireless Sensing for ISAC Networks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/?searchtype=author&query=Wang%2C+J">Jiacheng Wang</a>, <a href="/search/?searchtype=author&query=Du%2C+H">Hongyang Du</a>, <a href="/search/?searchtype=author&query=Liu%2C+Y">Yinqiu Liu</a>, <a href="/search/?searchtype=author&query=Sun%2C+G">Geng Sun</a>, <a href="/search/?searchtype=author&query=Niyato%2C+D">Dusit Niyato</a>, <a href="/search/?searchtype=author&query=Mao%2C+S">Shiwen Mao</a>, <a href="/search/?searchtype=author&query=Kim%2C+D+I">Dong In Kim</a>, <a href="/search/?searchtype=author&query=Shen%2C+X">Xuemin Shen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.11398v1-abstract-short" style="display: inline;"> Integrated sensing and communications (ISAC) is expected to be a key technology for 6G, and channel state information (CSI) based sensing is a key 
component of ISAC. However, current research on ISAC focuses mainly on improving sensing performance, overlooking security issues, particularly the unauthorized sensing of users. In this paper, we propose a secure sensing system (DFSS) based on two dist… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.11398v1-abstract-full').style.display = 'inline'; document.getElementById('2408.11398v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.11398v1-abstract-full" style="display: none;"> Integrated sensing and communications (ISAC) is expected to be a key technology for 6G, and channel state information (CSI) based sensing is a key component of ISAC. However, current research on ISAC focuses mainly on improving sensing performance, overlooking security issues, particularly the unauthorized sensing of users. In this paper, we propose a secure sensing system (DFSS) based on two distinct diffusion models. Specifically, we first propose a discrete conditional diffusion model to generate graphs with nodes and edges, guiding the ISAC system to appropriately activate wireless links and nodes, which ensures the sensing performance while minimizing the operation cost. Using the activated links and nodes, DFSS then employs the continuous conditional diffusion model to generate safeguarding signals, which are next modulated onto the pilot at the transmitter to mask fluctuations caused by user activities. As such, only ISAC devices authorized with the safeguarding signals can extract the true CSI for sensing, while unauthorized devices are unable to achieve the same sensing. Experiment results demonstrate that DFSS can reduce the activity recognition accuracy of the unauthorized devices by approximately 70%, effectively shield the user from the unauthorized surveillance. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.11398v1-abstract-full').style.display = 'none'; document.getElementById('2408.11398v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.08684">arXiv:2408.08684</a> <span> [<a href="https://arxiv.org/pdf/2408.08684">pdf</a>, <a href="https://arxiv.org/format/2408.08684">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Research on Personalized Compression Algorithm for Pre-trained Models Based on Homomorphic Entropy Increase </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/?searchtype=author&query=Li%2C+Y">Yicong Li</a>, <a href="/search/?searchtype=author&query=Guo%2C+X">Xing Guo</a>, <a href="/search/?searchtype=author&query=Du%2C+H">Haohua Du</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.08684v1-abstract-short" style="display: inline;"> In this article, we explore the challenges and evolution of two key technologies in the current field of AI: Vision Transformer model and Large Language Model (LLM). Vision Transformer captures global information by splitting images into small pieces and leveraging Transformer's multi-head attention mechanism, but its high reference count and compute overhead limit deployment on mobile devices. 
At… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.08684v1-abstract-full').style.display = 'inline'; document.getElementById('2408.08684v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.08684v1-abstract-full" style="display: none;"> In this article, we explore the challenges and evolution of two key technologies in the current field of AI: Vision Transformer model and Large Language Model (LLM). Vision Transformer captures global information by splitting images into small pieces and leveraging Transformer's multi-head attention mechanism, but its high reference count and compute overhead limit deployment on mobile devices. At the same time, the rapid development of LLM has revolutionized natural language processing, but it also faces huge deployment challenges. To address these issues, we investigate model pruning techniques, with a particular focus on how to reduce redundant parameters without losing accuracy to accommodate personalized data and resource-constrained environments. In this paper, a new layered pruning strategy is proposed to distinguish the personalized layer from the common layer by compressed sensing and random sampling, thus significantly reducing the model parameters. Our experimental results show that the introduced step buffering mechanism further improves the accuracy of the model after pruning, providing new directions and possibilities for the deployment of efficient and personalized AI models on mobile devices in the future. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.08684v1-abstract-full').style.display = 'none'; document.getElementById('2408.08684v1-abstract-short').style.display = 'inline';">△ Less</a>
</span>
</p>
<p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p>
</li>
</ol>
<!--
  Result-list pagination.
  ARIA convention used below: the link styled `is-current` is the only one
  that carries aria-current="page" and is labelled "Page N"; every other page
  link is labelled "Goto page N" and has no aria-current. Keep the visual
  state (`is-current`) and the ARIA state in sync when the current page changes.
-->
<nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination">
  <a href="" class="pagination-previous is-invisible">Previous </a>
  <a href="/search/?searchtype=author&query=Du%2C+H&start=50" class="pagination-next" >Next </a>
  <ul class="pagination-list">
    <li>
      <a href="/search/?searchtype=author&query=Du%2C+H&start=0" class="pagination-link is-current" aria-label="Page 1" aria-current="page">1 </a>
    </li>
    <li>
      <a href="/search/?searchtype=author&query=Du%2C+H&start=50" class="pagination-link " aria-label="Goto page 2">2 </a>
    </li>
    <li>
      <a href="/search/?searchtype=author&query=Du%2C+H&start=100" class="pagination-link " aria-label="Goto page 3">3 </a>
    </li>
    <li>
      <a href="/search/?searchtype=author&query=Du%2C+H&start=150" class="pagination-link " aria-label="Goto page 4">4 </a>
    </li>
    <li>
      <a href="/search/?searchtype=author&query=Du%2C+H&start=200" class="pagination-link " aria-label="Goto page 5">5 </a>
    </li>
    <li><span class="pagination-ellipsis">…</span></li>
  </ul>
</nav>
<div class="is-hidden-tablet">
  <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a> </span>
</div>
</div>
</main>
<footer>
  <div class="columns is-desktop" role="navigation" aria-label="Secondary">
    <div class="column">
      <div class="columns">
        <div class="column">
          <ul class="nav-spaced">
            <li><a href="https://info.arxiv.org/about">About</a></li>
            <li><a
href="https://info.arxiv.org/help">Help</a></li>
</ul>
</div>
<!-- Footer column: contact and mailing-list links, each preceded by an inline SVG icon. -->
<div class="column">
  <ul class="nav-spaced">
    <li>
      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>
      <a href="https://info.arxiv.org/help/contact.html"> Contact</a>
    </li>
    <li>
      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg>
      <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a>
    </li>
  </ul>
</div>
</div>
</div>
<!-- Footer column: policy links and operational-status links. -->
<div class="column">
  <div class="columns">
    <div class="column">
      <ul class="nav-spaced">
        <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li>
        <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li>
      </ul>
    </div>
    <div class="column sorry-app-links">
      <ul class="nav-spaced">
        <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li>
        <li>
          <!--
            The three links below open in a new tab (target="_blank") and point at
            other origins, so each carries rel="noopener": the opened page must not
            receive a window.opener reference back to this page.
          -->
          <p class="help">
            <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank" rel="noopener">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br>
            Get status notifications via
            <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank" rel="noopener"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a>
            or
            <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank" rel="noopener"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a>
          </p>
        </li>
      </ul>
    </div>
  </div>
</div>
</div>
</footer>
<script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script>
</body>
</html>

CINXE.COM

Search | arXiv e-print repository