Search | arXiv e-print repository

<!DOCTYPE html> <html lang="en"> <head> <meta charset="utf-8"/> <meta name="viewport" content="width=device-width, initial-scale=1"/>  <link rel="apple-touch-icon" sizes="180x180" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/apple-touch-icon.png"> <link rel="icon" type="image/png" sizes="32x32" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-32x32.png"> <link rel="icon" type="image/png" sizes="16x16" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-16x16.png"> <link rel="manifest" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/site.webmanifest"> <link rel="mask-icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/safari-pinned-tab.svg" color="#b31b1b"> <link rel="shortcut icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon.ico"> <meta name="msapplication-TileColor" content="#b31b1b"> <meta name="msapplication-config" content="images/icons/browserconfig.xml"> <meta name="theme-color" content="#b31b1b">  <title>Search | arXiv e-print repository</title> <script defer src="https://static.arxiv.org/static/base/1.0.0a5/fontawesome-free-5.11.2-web/js/all.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/base/1.0.0a5/css/arxivstyle.css" /> <script type="text/x-mathjax-config"> MathJax.Hub.Config({ messageStyle: "none", extensions: ["tex2jax.js"], jax: ["input/TeX", "output/HTML-CSS"], tex2jax: { inlineMath: [ ['$','$'], ["\$","\$"] ], displayMath: [ ['$$','$$'], ["\\[","\\]"] ], processEscapes: true, ignoreClass: '.*', processClass: 'mathjax.*' }, TeX: { extensions: ["AMSmath.js", "AMSsymbols.js", "noErrors.js"], noErrors: { inlineDelimiters: ["$","$"], multiLine: false, style: { "font-size": "normal", "border": "" } } }, "HTML-CSS": { availableFonts: ["TeX"] } }); </script> <script src='//static.arxiv.org/MathJax-2.7.3/MathJax.js'></script> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/notification.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/bulma-tooltip.min.css" /> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/search.css" /> <script src="https://code.jquery.com/jquery-3.2.1.slim.min.js" integrity="sha256-k2WSCIexGzOj3Euiig+TlR8gA0EmPjuc79OEeY5L45g=" crossorigin="anonymous"></script> <script src="https://static.arxiv.org/static/search/0.5.6/js/fieldset.js"></script> <style> radio#cf-customfield_11400 { display: none; } </style> </head> <body> <header><a href="#main-container" class="is-sr-only">Skip to main content</a>  <div class="attribution level is-marginless" role="banner"> <div class="level-left"> <a class="level-item" href="https://cornell.edu/"><img src="https://static.arxiv.org/static/base/1.0.0a5/images/cornell-reduced-white-SMALL.svg" alt="Cornell University" width="200" aria-label="logo" /></a> </div> <div class="level-right is-marginless"><p class="sponsors level-item is-marginless"><span id="support-ack-url">We gratefully acknowledge support from<br /> the Simons Foundation, <a href="https://info.arxiv.org/about/ourmembers.html">member institutions</a>, and all contributors. <a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div>  <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div>  <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1–50 of 140 results for author: <span class="mathjax">Pan, G</span> </h1> </div> <div class="level-right is-hidden-mobile">  <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>  </span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&query=Pan%2C+G">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." type="text" value="Pan, G"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Pan%2C+G&terms-0-field=author&size=50&order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Pan, G"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. </div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Pan%2C+G&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Pan%2C+G&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Pan%2C+G&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Pan%2C+G&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.13572">arXiv:2502.13572</a> <span> [<a href="https://arxiv.org/pdf/2502.13572">pdf</a>, <a href="https://arxiv.org/format/2502.13572">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> </div> </div> <p class="title is-5 mathjax"> Improving the Sparse Structure Learning of Spiking Neural Networks from the View of Compression Efficiency </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Shen%2C+J">Jiangrong Shen</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+Q">Qi Xu</a>, <a href="/search/cs?searchtype=author&query=Pan%2C+G">Gang Pan</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+B">Badong Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.13572v1-abstract-short" style="display: inline;"> The human brain utilizes spikes for information transmission and dynamically reorganizes its network structure to boost energy efficiency and cognitive capabilities throughout its lifespan. Drawing inspiration from this spike-based computation, Spiking Neural Networks (SNNs) have been developed to construct event-driven models that emulate this efficiency. Despite these advances, deep SNNs continu… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.13572v1-abstract-full').style.display = 'inline'; document.getElementById('2502.13572v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.13572v1-abstract-full" style="display: none;"> The human brain utilizes spikes for information transmission and dynamically reorganizes its network structure to boost energy efficiency and cognitive capabilities throughout its lifespan. Drawing inspiration from this spike-based computation, Spiking Neural Networks (SNNs) have been developed to construct event-driven models that emulate this efficiency. Despite these advances, deep SNNs continue to suffer from over-parameterization during training and inference, a stark contrast to the brain's ability to self-organize. Furthermore, existing sparse SNNs are challenged by maintaining optimal pruning levels due to a static pruning ratio, resulting in either under- or over-pruning. In this paper, we propose a novel two-stage dynamic structure learning approach for deep SNNs, aimed at maintaining effective sparse training from scratch while optimizing compression efficiency. The first stage evaluates the compressibility of existing sparse subnetworks within SNNs using the PQ index, which facilitates an adaptive determination of the rewiring ratio for synaptic connections based on data compression insights. In the second stage, this rewiring ratio critically informs the dynamic synaptic connection rewiring process, including both pruning and regrowth. This approach significantly improves the exploration of sparse structure training in deep SNNs, adapting sparsity dynamically from the point view of compression efficiency. Our experiments demonstrate that this sparse training approach not only aligns with the performance of current deep SNNs models but also significantly improves the efficiency of compressing sparse SNNs. Crucially, it preserves the advantages of initiating training with sparse models and offers a promising solution for implementing edge AI on neuromorphic hardware. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.13572v1-abstract-full').style.display = 'none'; document.getElementById('2502.13572v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.09449">arXiv:2502.09449</a> <span> [<a href="https://arxiv.org/pdf/2502.09449">pdf</a>, <a href="https://arxiv.org/format/2502.09449">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Neural and Evolutionary Computing">cs.NE</span> </div> </div> <p class="title is-5 mathjax"> Spiking Neural Networks for Temporal Processing: Status Quo and Future Prospects </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ma%2C+C">Chenxiang Ma</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+X">Xinyi Chen</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yanchen Li</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Q">Qu Yang</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+Y">Yujie Wu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+G">Guoqi Li</a>, <a href="/search/cs?searchtype=author&query=Pan%2C+G">Gang Pan</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+H">Huajin Tang</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+K+C">Kay Chen Tan</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+J">Jibin Wu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.09449v1-abstract-short" style="display: inline;"> Temporal processing is fundamental for both biological and artificial intelligence systems, as it enables the comprehension of dynamic environments and facilitates timely responses. Spiking Neural Networks (SNNs) excel in handling such data with high efficiency, owing to their rich neuronal dynamics and sparse activity patterns. Given the recent surge in the development of SNNs, there is an urgent… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.09449v1-abstract-full').style.display = 'inline'; document.getElementById('2502.09449v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.09449v1-abstract-full" style="display: none;"> Temporal processing is fundamental for both biological and artificial intelligence systems, as it enables the comprehension of dynamic environments and facilitates timely responses. Spiking Neural Networks (SNNs) excel in handling such data with high efficiency, owing to their rich neuronal dynamics and sparse activity patterns. Given the recent surge in the development of SNNs, there is an urgent need for a comprehensive evaluation of their temporal processing capabilities. In this paper, we first conduct an in-depth assessment of commonly used neuromorphic benchmarks, revealing critical limitations in their ability to evaluate the temporal processing capabilities of SNNs. To bridge this gap, we further introduce a benchmark suite consisting of three temporal processing tasks characterized by rich temporal dynamics across multiple timescales. Utilizing this benchmark suite, we perform a thorough evaluation of recently introduced SNN approaches to elucidate the current status of SNNs in temporal processing. Our findings indicate significant advancements in recently developed spiking neuron models and neural architectures regarding their temporal processing capabilities, while also highlighting a performance gap in handling long-range dependencies when compared to state-of-the-art non-spiking models. Finally, we discuss the key challenges and outline potential avenues for future research. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.09449v1-abstract-full').style.display = 'none'; document.getElementById('2502.09449v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.00983">arXiv:2502.00983</a> <span> [<a href="https://arxiv.org/pdf/2502.00983">pdf</a>, <a href="https://arxiv.org/format/2502.00983">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> CausalCOMRL: Context-Based Offline Meta-Reinforcement Learning with Causal Representation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+Z">Zhengzhe Zhang</a>, <a href="/search/cs?searchtype=author&query=Meng%2C+W">Wenjia Meng</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+H">Haoliang Sun</a>, <a href="/search/cs?searchtype=author&query=Pan%2C+G">Gang Pan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.00983v1-abstract-short" style="display: inline;"> Context-based offline meta-reinforcement learning (OMRL) methods have achieved appealing success by leveraging pre-collected offline datasets to develop task representations that guide policy learning. However, current context-based OMRL methods often introduce spurious correlations, where task components are incorrectly correlated due to confounders. These correlations can degrade policy performa… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.00983v1-abstract-full').style.display = 'inline'; document.getElementById('2502.00983v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.00983v1-abstract-full" style="display: none;"> Context-based offline meta-reinforcement learning (OMRL) methods have achieved appealing success by leveraging pre-collected offline datasets to develop task representations that guide policy learning. However, current context-based OMRL methods often introduce spurious correlations, where task components are incorrectly correlated due to confounders. These correlations can degrade policy performance when the confounders in the test task differ from those in the training task. To address this problem, we propose CausalCOMRL, a context-based OMRL method that integrates causal representation learning. This approach uncovers causal relationships among the task components and incorporates the causal relationships into task representations, enhancing the generalizability of RL agents. We further improve the distinction of task representations from different tasks by using mutual information optimization and contrastive learning. Utilizing these causal task representations, we employ SAC to optimize policies on meta-RL benchmarks. Experimental results show that CausalCOMRL achieves better performance than other methods on most benchmarks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.00983v1-abstract-full').style.display = 'none'; document.getElementById('2502.00983v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.00345">arXiv:2502.00345</a> <span> [<a href="https://arxiv.org/pdf/2502.00345">pdf</a>, <a href="https://arxiv.org/format/2502.00345">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multiagent Systems">cs.MA</span> </div> </div> <p class="title is-5 mathjax"> The Composite Task Challenge for Cooperative Multi-Agent Reinforcement Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yurui Li</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yuxuan Chen</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+L">Li Zhang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+S">Shijian Li</a>, <a href="/search/cs?searchtype=author&query=Pan%2C+G">Gang Pan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.00345v1-abstract-short" style="display: inline;"> The significant role of division of labor (DOL) in promoting cooperation is widely recognized in real-world applications.Many cooperative multi-agent reinforcement learning (MARL) methods have incorporated the concept of DOL to improve cooperation among agents.However, the tasks used in existing testbeds typically correspond to tasks where DOL is often not a necessary feature for achieving optimal… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.00345v1-abstract-full').style.display = 'inline'; document.getElementById('2502.00345v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.00345v1-abstract-full" style="display: none;"> The significant role of division of labor (DOL) in promoting cooperation is widely recognized in real-world applications.Many cooperative multi-agent reinforcement learning (MARL) methods have incorporated the concept of DOL to improve cooperation among agents.However, the tasks used in existing testbeds typically correspond to tasks where DOL is often not a necessary feature for achieving optimal policies.Additionally, the full utilize of DOL concept in MARL methods remains unrealized due to the absence of appropriate tasks.To enhance the generality and applicability of MARL methods in real-world scenarios, there is a necessary to develop tasks that demand multi-agent DOL and cooperation.In this paper, we propose a series of tasks designed to meet these requirements, drawing on real-world rules as the guidance for their design.We guarantee that DOL and cooperation are necessary condition for completing tasks and introduce three factors to expand the diversity of proposed tasks to cover more realistic situations.We evaluate 10 cooperative MARL methods on the proposed tasks.The results indicate that all baselines perform poorly on these tasks.To further validate the solvability of these tasks, we also propose simplified variants of proposed tasks.Experimental results show that baselines are able to handle these simplified variants, providing evidence of the solvability of the proposed tasks.The source files is available at https://github.com/Yurui-Li/CTC. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.00345v1-abstract-full').style.display = 'none'; document.getElementById('2502.00345v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.14970">arXiv:2501.14970</a> <span> [<a href="https://arxiv.org/pdf/2501.14970">pdf</a>, <a href="https://arxiv.org/format/2501.14970">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> AI-driven Wireless Positioning: Fundamentals, Standards, State-of-the-art, and Challenges </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Pan%2C+G">Guangjin Pan</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+Y">Yuan Gao</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+Y">Yilin Gao</a>, <a href="/search/cs?searchtype=author&query=Zhong%2C+Z">Zhiyong Zhong</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+X">Xiaoyu Yang</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+X">Xinyu Guo</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+S">Shugong Xu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.14970v1-abstract-short" style="display: inline;"> Wireless positioning technologies hold significant value for applications in autonomous driving, extended reality (XR), unmanned aerial vehicles (UAVs), and more. With the advancement of artificial intelligence (AI), leveraging AI to enhance positioning accuracy and robustness has emerged as a field full of potential. Driven by the requirements and functionalities defined in the 3rd Generation Par… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.14970v1-abstract-full').style.display = 'inline'; document.getElementById('2501.14970v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.14970v1-abstract-full" style="display: none;"> Wireless positioning technologies hold significant value for applications in autonomous driving, extended reality (XR), unmanned aerial vehicles (UAVs), and more. With the advancement of artificial intelligence (AI), leveraging AI to enhance positioning accuracy and robustness has emerged as a field full of potential. Driven by the requirements and functionalities defined in the 3rd Generation Partnership Project (3GPP) standards, AI/machine learning (ML)-based positioning is becoming a key technology to overcome the limitations of traditional methods. This paper begins with an introduction to the fundamentals of AI and wireless positioning, covering AI models, algorithms, positioning applications, emerging wireless technologies, and the basics of positioning techniques. Subsequently, focusing on standardization progress, we provide a comprehensive review of the evolution of 3GPP positioning standards, with an emphasis on the integration of AI/ML technologies in recent and upcoming releases. Based on the AI/ML-assisted positioning and direct AI/ML positioning schemes outlined in the standards, we conduct an in-depth investigation of related research. we focus on state-of-the-art (SOTA) research in AI-based line-of-sight (LOS)/non-line-of-sight (NLOS) detection, time of arrival (TOA)/time difference of arrival (TDOA) estimation, and angle estimation techniques. For Direct AI/ML Positioning, we explore SOTA advancements in fingerprint-based positioning, knowledge-assisted AI positioning, and channel charting-based positioning. Furthermore, we introduce publicly available datasets for wireless positioning and conclude by summarizing the challenges and opportunities of AI-driven wireless positioning. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.14970v1-abstract-full').style.display = 'none'; document.getElementById('2501.14970v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">32 pages. This work has been submitted to the IEEE for possible publication</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.02572">arXiv:2501.02572</a> <span> [<a href="https://arxiv.org/pdf/2501.02572">pdf</a>, <a href="https://arxiv.org/format/2501.02572">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Networking and Internet Architecture">cs.NI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> Energy Optimization of Multi-task DNN Inference in MEC-assisted XR Devices: A Lyapunov-Guided Reinforcement Learning Approach </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Sun%2C+Y">Yanzan Sun</a>, <a href="/search/cs?searchtype=author&query=Qiu%2C+J">Jiacheng Qiu</a>, <a href="/search/cs?searchtype=author&query=Pan%2C+G">Guangjin Pan</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+S">Shugong Xu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+S">Shunqing Zhang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xiaoyun Wang</a>, <a href="/search/cs?searchtype=author&query=Han%2C+S">Shuangfeng Han</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.02572v1-abstract-short" style="display: inline;"> Extended reality (XR), blending virtual and real worlds, is a key application of future networks. While AI advancements enhance XR capabilities, they also impose significant computational and energy challenges on lightweight XR devices. In this paper, we developed a distributed queue model for multi-task DNN inference, addressing issues of resource competition and queue coupling. In response to th… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.02572v1-abstract-full').style.display = 'inline'; document.getElementById('2501.02572v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.02572v1-abstract-full" style="display: none;"> Extended reality (XR), blending virtual and real worlds, is a key application of future networks. While AI advancements enhance XR capabilities, they also impose significant computational and energy challenges on lightweight XR devices. In this paper, we developed a distributed queue model for multi-task DNN inference, addressing issues of resource competition and queue coupling. In response to the challenges posed by the high energy consumption and limited resources of XR devices, we designed a dual time-scale joint optimization strategy for model partitioning and resource allocation, formulated as a bi-level optimization problem. This strategy aims to minimize the total energy consumption of XR devices while ensuring queue stability and adhering to computational and communication resource constraints. To tackle this problem, we devised a Lyapunov-guided Proximal Policy Optimization algorithm, named LyaPPO. Numerical results demonstrate that the LyaPPO algorithm outperforms the baselines, achieving energy conservation of 24.79% to 46.14% under varying resource capacities. Specifically, the proposed algorithm reduces the energy consumption of XR devices by 24.29% to 56.62% compared to baseline algorithms. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.02572v1-abstract-full').style.display = 'none'; document.getElementById('2501.02572v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">13 pages, 7 figures. This work has been submitted to the IEEE for possible publication</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.19055">arXiv:2412.19055</a> <span> [<a href="https://arxiv.org/pdf/2412.19055">pdf</a>, <a href="https://arxiv.org/format/2412.19055">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> SpectralKD: A Unified Framework for Interpreting and Distilling Vision Transformers via Spectral Analysis </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Tian%2C+H">Huiyuan Tian</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+B">Bonan Xu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+S">Shijian Li</a>, <a href="/search/cs?searchtype=author&query=Pan%2C+G">Gang Pan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.19055v3-abstract-short" style="display: inline;"> Knowledge Distillation (KD) has achieved widespread success in compressing large Vision Transformers (ViTs), but a unified theoretical framework for both ViTs and KD is still lacking. In this paper, we propose SpectralKD, a novel unified analytical framework that offers deeper insights into ViTs and optimizes KD via spectral analysis. Our model-wise analysis reveals that CaiT concentrates informat… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.19055v3-abstract-full').style.display = 'inline'; document.getElementById('2412.19055v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.19055v3-abstract-full" style="display: none;"> Knowledge Distillation (KD) has achieved widespread success in compressing large Vision Transformers (ViTs), but a unified theoretical framework for both ViTs and KD is still lacking. In this paper, we propose SpectralKD, a novel unified analytical framework that offers deeper insights into ViTs and optimizes KD via spectral analysis. Our model-wise analysis reveals that CaiT concentrates information in their first and last few layers, informing optimal layer selection for KD. Surprisingly, our layer-wise analysis discovers that Swin Transformer and CaiT exhibit similar spectral encoding patterns despite their architectural differences, leading to feature map alignment guideline. Building on these insights, we propose a simple yet effective spectral alignment method for KD. Benefiting from the deeper understanding by above analysis results, even such a simple strategy achieves state-of-the-art performance on ImageNet-1K without introducing any trainable parameters, improving DeiT-Tiny by $+5.2\%$ and Swin-Tiny by $+1.4\%$ in top-1 accuracy. Furthermore, our post-training analysis reveals that distilled students can reproduce spectral patterns similar to their teachers, opening a new area we term ``distillation dynamics". Code and experimental logs are available in https://github.com/thy960112/SpectralKD. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.19055v3-abstract-full').style.display = 'none'; document.getElementById('2412.19055v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 25 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.15634">arXiv:2412.15634</a> <span> [<a href="https://arxiv.org/pdf/2412.15634">pdf</a>, <a href="https://arxiv.org/format/2412.15634">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Software Engineering">cs.SE</span> </div> </div> <p class="title is-5 mathjax"> Darkit: A User-Friendly Software Toolkit for Spiking Large Language Model </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Du%2C+X">Xin Du</a>, <a href="/search/cs?searchtype=author&query=Ye%2C+S">Shifan Ye</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+Q">Qian Zheng</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+Y">Yangfan Hu</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+R">Rui Yan</a>, <a href="/search/cs?searchtype=author&query=Qi%2C+S">Shunyu Qi</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+S">Shuyang Chen</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+H">Huajin Tang</a>, <a href="/search/cs?searchtype=author&query=Pan%2C+G">Gang Pan</a>, <a href="/search/cs?searchtype=author&query=Deng%2C+S">Shuiguang Deng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.15634v1-abstract-short" style="display: inline;"> Large language models (LLMs) have been widely applied in various practical applications, typically comprising billions of parameters, with inference processes requiring substantial energy and computational resources. In contrast, the human brain, employing bio-plausible spiking mechanisms, can accomplish the same tasks while significantly reducing energy consumption, even with a similar number of… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.15634v1-abstract-full').style.display = 'inline'; document.getElementById('2412.15634v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.15634v1-abstract-full" style="display: none;"> Large language models (LLMs) have been widely applied in various practical applications, typically comprising billions of parameters, with inference processes requiring substantial energy and computational resources. In contrast, the human brain, employing bio-plausible spiking mechanisms, can accomplish the same tasks while significantly reducing energy consumption, even with a similar number of parameters. Based on this, several pioneering researchers have proposed and implemented various large language models that leverage spiking neural networks. They have demonstrated the feasibility of these models, validated their performance, and open-sourced their frameworks and partial source code. To accelerate the adoption of brain-inspired large language models and facilitate secondary development for researchers, we are releasing a software toolkit named DarwinKit (Darkit). The toolkit is designed specifically for learners, researchers, and developers working on spiking large models, offering a suite of highly user-friendly features that greatly simplify the learning, deployment, and development processes. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.15634v1-abstract-full').style.display = 'none'; document.getElementById('2412.15634v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.12159">arXiv:2412.12159</a> <span> [<a href="https://arxiv.org/pdf/2412.12159">pdf</a>, <a href="https://arxiv.org/format/2412.12159">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Personalized Sleep Staging Leveraging Source-free Unsupervised Domain Adaptation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhou%2C+Y">Yangxuan Zhou</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+S">Sha Zhao</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+J">Jiquan Wang</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+H">Haiteng Jiang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+h">hijian Li</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+B">Benyan Luo</a>, <a href="/search/cs?searchtype=author&query=Li%2C+T">Tao Li</a>, <a href="/search/cs?searchtype=author&query=Pan%2C+G">Gang Pan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.12159v1-abstract-short" style="display: inline;"> Sleep staging is crucial for assessing sleep quality and diagnosing related disorders. Recent deep learning models for automatic sleep staging using polysomnography often suffer from poor generalization to new subjects because they are trained and tested on the same labeled datasets, overlooking individual differences. To tackle this issue, we propose a novel Source-Free Unsupervised Individual Do… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.12159v1-abstract-full').style.display = 'inline'; document.getElementById('2412.12159v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.12159v1-abstract-full" style="display: none;"> Sleep staging is crucial for assessing sleep quality and diagnosing related disorders. Recent deep learning models for automatic sleep staging using polysomnography often suffer from poor generalization to new subjects because they are trained and tested on the same labeled datasets, overlooking individual differences. To tackle this issue, we propose a novel Source-Free Unsupervised Individual Domain Adaptation (SF-UIDA) framework. This two-step adaptation scheme allows the model to effectively adjust to new unlabeled individuals without needing source data, facilitating personalized customization in clinical settings. Our framework has been applied to three established sleep staging models and tested on three public datasets, achieving state-of-the-art performance. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.12159v1-abstract-full').style.display = 'none'; document.getElementById('2412.12159v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">9 pages, 6 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.11812">arXiv:2412.11812</a> <span> [<a href="https://arxiv.org/pdf/2412.11812">pdf</a>, <a href="https://arxiv.org/format/2412.11812">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> CLDA-YOLO: Visual Contrastive Learning Based Domain Adaptive YOLO Detector </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Qiu%2C+T">Tianheng Qiu</a>, <a href="/search/cs?searchtype=author&query=Law%2C+K+L">Ka Lung Law</a>, <a href="/search/cs?searchtype=author&query=Pan%2C+G">Guanghua Pan</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+J">Jufei Wang</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+X">Xin Gao</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+X">Xuan Huang</a>, <a href="/search/cs?searchtype=author&query=Wei%2C+H">Hu Wei</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.11812v1-abstract-short" style="display: inline;"> Unsupervised domain adaptive (UDA) algorithms can markedly enhance the performance of object detectors under conditions of domain shifts, thereby reducing the necessity for extensive labeling and retraining. Current domain adaptive object detection algorithms primarily cater to two-stage detectors, which tend to offer minimal improvements when directly applied to single-stage detectors such as YOL… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.11812v1-abstract-full').style.display = 'inline'; document.getElementById('2412.11812v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.11812v1-abstract-full" style="display: none;"> Unsupervised domain adaptive (UDA) algorithms can markedly enhance the performance of object detectors under conditions of domain shifts, thereby reducing the necessity for extensive labeling and retraining. Current domain adaptive object detection algorithms primarily cater to two-stage detectors, which tend to offer minimal improvements when directly applied to single-stage detectors such as YOLO. Intending to benefit the YOLO detector from UDA, we build a comprehensive domain adaptive architecture using a teacher-student cooperative system for the YOLO detector. In this process, we propose uncertainty learning to cope with pseudo-labeling generated by the teacher model with extreme uncertainty and leverage dynamic data augmentation to asymptotically adapt the teacher-student system to the environment. To address the inability of single-stage object detectors to align at multiple stages, we utilize a unified visual contrastive learning paradigm that aligns instance at backbone and head respectively, which steadily improves the robustness of the detectors in cross-domain tasks. In summary, we present an unsupervised domain adaptive YOLO detector based on visual contrastive learning (CLDA-YOLO), which achieves highly competitive results across multiple domain adaptive datasets without any reduction in inference speed. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.11812v1-abstract-full').style.display = 'none'; document.getElementById('2412.11812v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.11138">arXiv:2412.11138</a> <span> [<a href="https://arxiv.org/pdf/2412.11138">pdf</a>, <a href="https://arxiv.org/format/2412.11138">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Safe Reinforcement Learning using Finite-Horizon Gradient-based Estimation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Dai%2C+J">Juntao Dai</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Y">Yaodong Yang</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+Q">Qian Zheng</a>, <a href="/search/cs?searchtype=author&query=Pan%2C+G">Gang Pan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.11138v1-abstract-short" style="display: inline;"> A key aspect of Safe Reinforcement Learning (Safe RL) involves estimating the constraint condition for the next policy, which is crucial for guiding the optimization of safe policy updates. However, the existing Advantage-based Estimation (ABE) method relies on the infinite-horizon discounted advantage function. This dependence leads to catastrophic errors in finite-horizon scenarios with non-disc… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.11138v1-abstract-full').style.display = 'inline'; document.getElementById('2412.11138v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.11138v1-abstract-full" style="display: none;"> A key aspect of Safe Reinforcement Learning (Safe RL) involves estimating the constraint condition for the next policy, which is crucial for guiding the optimization of safe policy updates. However, the existing Advantage-based Estimation (ABE) method relies on the infinite-horizon discounted advantage function. This dependence leads to catastrophic errors in finite-horizon scenarios with non-discounted constraints, resulting in safety-violation updates. In response, we propose the first estimation method for finite-horizon non-discounted constraints in deep Safe RL, termed Gradient-based Estimation (GBE), which relies on the analytic gradient derived along trajectories. Our theoretical and empirical analyses demonstrate that GBE can effectively estimate constraint changes over a finite horizon. Constructing a surrogate optimization problem with GBE, we developed a novel Safe RL algorithm called Constrained Gradient-based Policy Optimization (CGPO). CGPO identifies feasible optimal policies by iteratively resolving sub-problems within trust regions. Our empirical results reveal that CGPO, unlike baseline algorithms, successfully estimates the constraint functions of subsequent policies, thereby ensuring the efficiency and feasibility of each update. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.11138v1-abstract-full').style.display = 'none'; document.getElementById('2412.11138v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> Proceedings of the 41st International Conference on Machine Learning, PMLR 235:9872-9903, 2024 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.09849">arXiv:2412.09849</a> <span> [<a href="https://arxiv.org/pdf/2412.09849">pdf</a>, <a href="https://arxiv.org/ps/2412.09849">ps</a>, <a href="https://arxiv.org/format/2412.09849">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Deep Learning for Spectrum Prediction in Cognitive Radio Networks: State-of-the-Art, New Opportunities, and Challenges </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Pan%2C+G">Guangliang Pan</a>, <a href="/search/cs?searchtype=author&query=Yau%2C+D+K+Y">David K. Y. Yau</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+B">Bo Zhou</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+Q">Qihui Wu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.09849v1-abstract-short" style="display: inline;"> Spectrum prediction is considered to be a promising technology that enhances spectrum efficiency by assisting dynamic spectrum access (DSA) in cognitive radio networks (CRN). Nonetheless, the highly nonlinear nature of spectrum data across time, frequency, and space domains, coupled with the intricate spectrum usage patterns, poses challenges for accurate spectrum prediction. Deep learning (DL), r… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.09849v1-abstract-full').style.display = 'inline'; document.getElementById('2412.09849v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.09849v1-abstract-full" style="display: none;"> Spectrum prediction is considered to be a promising technology that enhances spectrum efficiency by assisting dynamic spectrum access (DSA) in cognitive radio networks (CRN). Nonetheless, the highly nonlinear nature of spectrum data across time, frequency, and space domains, coupled with the intricate spectrum usage patterns, poses challenges for accurate spectrum prediction. Deep learning (DL), recognized for its capacity to extract nonlinear features, has been applied to solve these challenges. This paper first shows the advantages of applying DL by comparing with traditional prediction methods. Then, the current state-of-the-art DL-based spectrum prediction techniques are reviewed and summarized in terms of intra-band and crossband prediction. Notably, this paper uses a real-world spectrum dataset to prove the advancements of DL-based methods. Then, this paper proposes a novel intra-band spatiotemporal spectrum prediction framework named ViTransLSTM. This framework integrates visual self-attention and long short-term memory to capture both local and global long-term spatiotemporal dependencies of spectrum usage patterns. Similarly, the effectiveness of the proposed framework is validated on the aforementioned real-world dataset. Finally, the paper presents new related challenges and potential opportunities for future research. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.09849v1-abstract-full').style.display = 'none'; document.getElementById('2412.09849v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.07236">arXiv:2412.07236</a> <span> [<a href="https://arxiv.org/pdf/2412.07236">pdf</a>, <a href="https://arxiv.org/format/2412.07236">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Neurons and Cognition">q-bio.NC</span> </div> </div> <p class="title is-5 mathjax"> CBraMod: A Criss-Cross Brain Foundation Model for EEG Decoding </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+J">Jiquan Wang</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+S">Sha Zhao</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+Z">Zhiling Luo</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+Y">Yangxuan Zhou</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+H">Haiteng Jiang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+S">Shijian Li</a>, <a href="/search/cs?searchtype=author&query=Li%2C+T">Tao Li</a>, <a href="/search/cs?searchtype=author&query=Pan%2C+G">Gang Pan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.07236v2-abstract-short" style="display: inline;"> Electroencephalography (EEG) is a non-invasive technique to measure and record brain electrical activity, widely used in various BCI and healthcare applications. Early EEG decoding methods rely on supervised learning, limited by specific tasks and datasets, hindering model performance and generalizability. With the success of large language models, there is a growing body of studies focusing on EE… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.07236v2-abstract-full').style.display = 'inline'; document.getElementById('2412.07236v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.07236v2-abstract-full" style="display: none;"> Electroencephalography (EEG) is a non-invasive technique to measure and record brain electrical activity, widely used in various BCI and healthcare applications. Early EEG decoding methods rely on supervised learning, limited by specific tasks and datasets, hindering model performance and generalizability. With the success of large language models, there is a growing body of studies focusing on EEG foundation models. However, these studies still leave challenges: Firstly, most of existing EEG foundation models employ full EEG modeling strategy. It models the spatial and temporal dependencies between all EEG patches together, but ignores that the spatial and temporal dependencies are heterogeneous due to the unique structural characteristics of EEG signals. Secondly, existing EEG foundation models have limited generalizability on a wide range of downstream BCI tasks due to varying formats of EEG data, making it challenging to adapt to. To address these challenges, we propose a novel foundation model called CBraMod. Specifically, we devise a criss-cross transformer as the backbone to thoroughly leverage the structural characteristics of EEG signals, which can model spatial and temporal dependencies separately through two parallel attention mechanisms. And we utilize an asymmetric conditional positional encoding scheme which can encode positional information of EEG patches and be easily adapted to the EEG with diverse formats. CBraMod is pre-trained on a very large corpus of EEG through patch-based masked EEG reconstruction. We evaluate CBraMod on up to 10 downstream BCI tasks (12 public datasets). CBraMod achieves the state-of-the-art performance across the wide range of tasks, proving its strong capability and generalizability. The source code is publicly available at https://github.com/wjq-learning/CBraMod. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.07236v2-abstract-full').style.display = 'none'; document.getElementById('2412.07236v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 10 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by The Thirteenth International Conference on Learning Representations (ICLR 2025)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.06720">arXiv:2412.06720</a> <span> [<a href="https://arxiv.org/pdf/2412.06720">pdf</a>, <a href="https://arxiv.org/format/2412.06720">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> VP-MEL: Visual Prompts Guided Multimodal Entity Linking </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Mi%2C+H">Hongze Mi</a>, <a href="/search/cs?searchtype=author&query=Li%2C+J">Jinyuan Li</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xuying Zhang</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+H">Haoran Cheng</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+J">Jiahao Wang</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+D">Di Sun</a>, <a href="/search/cs?searchtype=author&query=Pan%2C+G">Gang Pan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.06720v4-abstract-short" style="display: inline;"> Multimodal entity linking (MEL), a task aimed at linking mentions within multimodal contexts to their corresponding entities in a knowledge base (KB), has attracted much attention due to its wide applications in recent years. However, existing MEL methods often rely on mention words as retrieval cues, which limits their ability to effectively utilize information from both images and text. This rel… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.06720v4-abstract-full').style.display = 'inline'; document.getElementById('2412.06720v4-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.06720v4-abstract-full" style="display: none;"> Multimodal entity linking (MEL), a task aimed at linking mentions within multimodal contexts to their corresponding entities in a knowledge base (KB), has attracted much attention due to its wide applications in recent years. However, existing MEL methods often rely on mention words as retrieval cues, which limits their ability to effectively utilize information from both images and text. This reliance causes MEL to struggle with accurately retrieving entities in certain scenarios, especially when the focus is on image objects or mention words are missing from the text. To solve these issues, we introduce a Visual Prompts guided Multimodal Entity Linking (VP-MEL) task. Given a text-image pair, VP-MEL aims to link a marked region (i.e., visual prompt) in an image to its corresponding entities in the knowledge base. To facilitate this task, we present a new dataset, VPWiki, specifically designed for VP-MEL. Furthermore, we propose a framework named IIER, which enhances visual feature extraction using visual prompts and leverages the pretrained Detective-VLM model to capture latent information. Experimental results on the VPWiki dataset demonstrate that IIER outperforms baseline methods across multiple benchmarks for the VP-MEL task. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.06720v4-abstract-full').style.display = 'none'; document.getElementById('2412.06720v4-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 9 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.03957">arXiv:2412.03957</a> <span> [<a href="https://arxiv.org/pdf/2412.03957">pdf</a>, <a href="https://arxiv.org/format/2412.03957">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> A Framework For Image Synthesis Using Supervised Contrastive Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yibin Liu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jianyu Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+L">Li Zhang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+S">Shijian Li</a>, <a href="/search/cs?searchtype=author&query=Pan%2C+G">Gang Pan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.03957v1-abstract-short" style="display: inline;"> Text-to-image (T2I) generation aims at producing realistic images corresponding to text descriptions. Generative Adversarial Network (GAN) has proven to be successful in this task. Typical T2I GANs are 2 phase methods that first pretrain an inter-modal representation from aligned image-text pairs and then use GAN to train image generator on that basis. However, such representation ignores the inne… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.03957v1-abstract-full').style.display = 'inline'; document.getElementById('2412.03957v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.03957v1-abstract-full" style="display: none;"> Text-to-image (T2I) generation aims at producing realistic images corresponding to text descriptions. Generative Adversarial Network (GAN) has proven to be successful in this task. Typical T2I GANs are 2 phase methods that first pretrain an inter-modal representation from aligned image-text pairs and then use GAN to train image generator on that basis. However, such representation ignores the inner-modal semantic correspondence, e.g. the images with same label. The semantic label in priory describes the inherent distribution pattern with underlying cross-image relationships, which is supplement to the text description for understanding the full characteristics of image. In this paper, we propose a framework leveraging both inter- and inner-modal correspondence by label guided supervised contrastive learning. We extend the T2I GANs to two parameter-sharing contrast branches in both pretraining and generation phases. This integration effectively clusters the semantically similar image-text pair representations, thereby fostering the generation of higher-quality images. We demonstrate our framework on four novel T2I GANs by both single-object dataset CUB and multi-object dataset COCO, achieving significant improvements in the Inception Score (IS) and Frechet Inception Distance (FID) metrics of imagegeneration evaluation. Notably, on more complex multi-object COCO, our framework improves FID by 30.1%, 27.3%, 16.2% and 17.1% for AttnGAN, DM-GAN, SSA-GAN and GALIP, respectively. We also validate our superiority by comparing with other label guided T2I GANs. The results affirm the effectiveness and competitiveness of our approach in advancing the state-of-the-art GAN for T2I generation <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.03957v1-abstract-full').style.display = 'none'; document.getElementById('2412.03957v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.17045">arXiv:2411.17045</a> <span> [<a href="https://arxiv.org/pdf/2411.17045">pdf</a>, <a href="https://arxiv.org/format/2411.17045">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Software Engineering">cs.SE</span> </div> </div> <p class="title is-5 mathjax"> Redefining Crowdsourced Test Report Prioritization: An Innovative Approach with Large Language Model </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ling%2C+Y">Yuchen Ling</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+S">Shengcheng Yu</a>, <a href="/search/cs?searchtype=author&query=Fang%2C+C">Chunrong Fang</a>, <a href="/search/cs?searchtype=author&query=Pan%2C+G">Guobin Pan</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+J">Jun Wang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+J">Jia Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.17045v1-abstract-short" style="display: inline;"> Context: Crowdsourced testing has gained popularity in software testing, especially for mobile app testing, due to its ability to bring diversity and tackle fragmentation issues. However, the openness of crowdsourced testing presents challenges, particularly in the manual review of numerous test reports, which is time-consuming and labor-intensive. Objective: The primary goal of this research is t… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.17045v1-abstract-full').style.display = 'inline'; document.getElementById('2411.17045v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.17045v1-abstract-full" style="display: none;"> Context: Crowdsourced testing has gained popularity in software testing, especially for mobile app testing, due to its ability to bring diversity and tackle fragmentation issues. However, the openness of crowdsourced testing presents challenges, particularly in the manual review of numerous test reports, which is time-consuming and labor-intensive. Objective: The primary goal of this research is to improve the efficiency of review processes in crowdsourced testing. Traditional approaches to test report prioritization lack a deep understanding of semantic information in textual descriptions of these reports. This paper introduces LLMPrior, a novel approach for prioritizing crowdsourced test reports using large language models (LLMs). Method: LLMPrior leverages LLMs for the analysis and clustering of crowdsourced test reports based on the types of bugs revealed in their textual descriptions. This involves using prompt engineering techniques to enhance the performance of LLMs. Following the clustering, a recurrent selection algorithm is applied to prioritize the reports. Results: Empirical experiments are conducted to evaluate the effectiveness of LLMPrior. The findings indicate that LLMPrior not only surpasses current state-of-the-art approaches in terms of performance but also proves to be more feasible, efficient, and reliable. This success is attributed to the use of prompt engineering techniques and the cluster-based prioritization strategy. Conclusion: LLMPrior represents a significant advancement in crowdsourced test report prioritization. By effectively utilizing large language models and a cluster-based strategy, it addresses the challenges in traditional prioritization approaches, offering a more efficient and reliable solution for app developers dealing with crowdsourced test reports. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.17045v1-abstract-full').style.display = 'none'; document.getElementById('2411.17045v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by Information and Software Technology in Nov 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.15885">arXiv:2410.15885</a> <span> [<a href="https://arxiv.org/pdf/2410.15885">pdf</a>, <a href="https://arxiv.org/format/2410.15885">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> How to Build a Pre-trained Multimodal model for Simultaneously Chatting and Decision-making? </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Tang%2C+Z">Zuojin Tang</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+B">Bin Hu</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+C">Chenyang Zhao</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+D">De Ma</a>, <a href="/search/cs?searchtype=author&query=Pan%2C+G">Gang Pan</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+B">Bin Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.15885v1-abstract-short" style="display: inline;"> Existing large pre-trained models typically map text input to text output in an end-to-end manner, such as ChatGPT, or map a segment of text input to a hierarchy of action decisions, such as OpenVLA. However, humans can simultaneously generate text and actions when receiving specific input signals. For example, a driver can make precise driving decisions while conversing with a friend in the passe… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.15885v1-abstract-full').style.display = 'inline'; document.getElementById('2410.15885v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.15885v1-abstract-full" style="display: none;"> Existing large pre-trained models typically map text input to text output in an end-to-end manner, such as ChatGPT, or map a segment of text input to a hierarchy of action decisions, such as OpenVLA. However, humans can simultaneously generate text and actions when receiving specific input signals. For example, a driver can make precise driving decisions while conversing with a friend in the passenger seat. Motivated by this observation, we consider the following question in this work: is it possible to construct a pre-trained model that can provide both language interaction and precise decision-making capabilities in dynamic open scenarios. We provide a definitive answer to this question by developing a new model architecture termed Visual Language Action model for Chatting and Decision Making (VLA4CD), and further demonstrating its performance in challenging autonomous driving tasks. Specifically, we leverage LoRA to fine-tune a pre-trained LLM with data of multiple modalities covering language, visual, and action. Unlike the existing LoRA operations used for LLM fine-tuning, we have designed new computational modules and training cost functions for VLA4CD. These designs enable VLA4CD to provide continuous-valued action decisions while outputting text responses. In contrast, existing LLMs can only output text responses, and current VLA models can only output action decisions. Moreover, these VLA models handle action data by discretizing and then tokenizing the discretized actions, a method unsuitable for complex decision-making tasks involving high-dimensional continuous-valued action vectors, such as autonomous driving. The experimental results on CARLA validate that: (1) our proposed model construction method is effective; (2) compared to the SOTA VLA model, VLA4CD can provide more accurate real-time decision-making while retaining the text interaction capability inherent to LLMs. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.15885v1-abstract-full').style.display = 'none'; document.getElementById('2410.15885v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.15689">arXiv:2410.15689</a> <span> [<a href="https://arxiv.org/pdf/2410.15689">pdf</a>, <a href="https://arxiv.org/format/2410.15689">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Neural and Evolutionary Computing">cs.NE</span> </div> </div> <p class="title is-5 mathjax"> Enhancing SNN-based Spatio-Temporal Learning: A Benchmark Dataset and Cross-Modality Attention Model </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhou%2C+S">Shibo Zhou</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+B">Bo Yang</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+M">Mengwen Yuan</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+R">Runhao Jiang</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+R">Rui Yan</a>, <a href="/search/cs?searchtype=author&query=Pan%2C+G">Gang Pan</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+H">Huajin Tang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.15689v1-abstract-short" style="display: inline;"> Spiking Neural Networks (SNNs), renowned for their low power consumption, brain-inspired architecture, and spatio-temporal representation capabilities, have garnered considerable attention in recent years. Similar to Artificial Neural Networks (ANNs), high-quality benchmark datasets are of great importance to the advances of SNNs. However, our analysis indicates that many prevalent neuromorphic da… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.15689v1-abstract-full').style.display = 'inline'; document.getElementById('2410.15689v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.15689v1-abstract-full" style="display: none;"> Spiking Neural Networks (SNNs), renowned for their low power consumption, brain-inspired architecture, and spatio-temporal representation capabilities, have garnered considerable attention in recent years. Similar to Artificial Neural Networks (ANNs), high-quality benchmark datasets are of great importance to the advances of SNNs. However, our analysis indicates that many prevalent neuromorphic datasets lack strong temporal correlation, preventing SNNs from fully exploiting their spatio-temporal representation capabilities. Meanwhile, the integration of event and frame modalities offers more comprehensive visual spatio-temporal information. Yet, the SNN-based cross-modality fusion remains underexplored. In this work, we present a neuromorphic dataset called DVS-SLR that can better exploit the inherent spatio-temporal properties of SNNs. Compared to existing datasets, it offers advantages in terms of higher temporal correlation, larger scale, and more varied scenarios. In addition, our neuromorphic dataset contains corresponding frame data, which can be used for developing SNN-based fusion methods. By virtue of the dual-modal feature of the dataset, we propose a Cross-Modality Attention (CMA) based fusion method. The CMA model efficiently utilizes the unique advantages of each modality, allowing for SNNs to learn both temporal and spatial attention scores from the spatio-temporal features of event and frame modalities, subsequently allocating these scores across modalities to enhance their synergy. Experimental results demonstrate that our method not only improves recognition accuracy but also ensures robustness across diverse scenarios. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.15689v1-abstract-full').style.display = 'none'; document.getElementById('2410.15689v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.07266">arXiv:2410.07266</a> <span> [<a href="https://arxiv.org/pdf/2410.07266">pdf</a>, <a href="https://arxiv.org/format/2410.07266">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Spiking GS: Towards High-Accuracy and Low-Cost Surface Reconstruction via Spiking Neuron-based Gaussian Splatting </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+W">Weixing Zhang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zongrui Li</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+D">De Ma</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+H">Huajin Tang</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+X">Xudong Jiang</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+Q">Qian Zheng</a>, <a href="/search/cs?searchtype=author&query=Pan%2C+G">Gang Pan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.07266v5-abstract-short" style="display: inline;"> 3D Gaussian Splatting is capable of reconstructing 3D scenes in minutes. Despite recent advances in improving surface reconstruction accuracy, the reconstructed results still exhibit bias and suffer from inefficiency in storage and training. This paper provides a different observation on the cause of the inefficiency and the reconstruction bias, which is attributed to the integration of the low-op… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.07266v5-abstract-full').style.display = 'inline'; document.getElementById('2410.07266v5-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.07266v5-abstract-full" style="display: none;"> 3D Gaussian Splatting is capable of reconstructing 3D scenes in minutes. Despite recent advances in improving surface reconstruction accuracy, the reconstructed results still exhibit bias and suffer from inefficiency in storage and training. This paper provides a different observation on the cause of the inefficiency and the reconstruction bias, which is attributed to the integration of the low-opacity parts (LOPs) of the generated Gaussians. We show that LOPs consist of Gaussians with overall low-opacity (LOGs) and the low-opacity tails (LOTs) of Gaussians. We propose Spiking GS to reduce such two types of LOPs by integrating spiking neurons into the Gaussian Splatting pipeline. Specifically, we introduce global and local full-precision integrate-and-fire spiking neurons to the opacity and representation function of flattened 3D Gaussians, respectively. Furthermore, we enhance the density control strategy with spiking neurons' thresholds and a new criterion on the scale of Gaussians. Our method can represent more accurate reconstructed surfaces at a lower cost. The supplementary material and code are available at https://github.com/zju-bmi-lab/SpikingGS. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.07266v5-abstract-full').style.display = 'none'; document.getElementById('2410.07266v5-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 8 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.05684">arXiv:2410.05684</a> <span> [<a href="https://arxiv.org/pdf/2410.05684">pdf</a>, <a href="https://arxiv.org/format/2410.05684">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Copiloting Diagnosis of Autism in Real Clinical Scenarios via LLMs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Jiang%2C+Y">Yi Jiang</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+Q">Qingyang Shen</a>, <a href="/search/cs?searchtype=author&query=Lai%2C+S">Shuzhong Lai</a>, <a href="/search/cs?searchtype=author&query=Qi%2C+S">Shunyu Qi</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+Q">Qian Zheng</a>, <a href="/search/cs?searchtype=author&query=Yao%2C+L">Lin Yao</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yueming Wang</a>, <a href="/search/cs?searchtype=author&query=Pan%2C+G">Gang Pan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.05684v2-abstract-short" style="display: inline;"> Autism spectrum disorder(ASD) is a pervasive developmental disorder that significantly impacts the daily functioning and social participation of individuals. Despite the abundance of research focused on supporting the clinical diagnosis of ASD, there is still a lack of systematic and comprehensive exploration in the field of methods based on Large Language Models (LLMs), particularly regarding the… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.05684v2-abstract-full').style.display = 'inline'; document.getElementById('2410.05684v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.05684v2-abstract-full" style="display: none;"> Autism spectrum disorder(ASD) is a pervasive developmental disorder that significantly impacts the daily functioning and social participation of individuals. Despite the abundance of research focused on supporting the clinical diagnosis of ASD, there is still a lack of systematic and comprehensive exploration in the field of methods based on Large Language Models (LLMs), particularly regarding the real-world clinical diagnostic scenarios based on Autism Diagnostic Observation Schedule, Second Edition (ADOS-2). Therefore, we have proposed a framework called ADOS-Copilot, which strikes a balance between scoring and explanation and explored the factors that influence the performance of LLMs in this task. The experimental results indicate that our proposed framework is competitive with the diagnostic results of clinicians, with a minimum MAE of 0.4643, binary classification F1-score of 81.79\%, and ternary classification F1-score of 78.37\%. Furthermore, we have systematically elucidated the strengths and limitations of current LLMs in this task from the perspectives of ADOS-2, LLMs' capabilities, language, and model scale aiming to inspire and guide the future application of LLMs in a broader fields of mental health disorders. We hope for more research to be transferred into real clinical practice, opening a window of kindness to the world for eccentric children. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.05684v2-abstract-full').style.display = 'none'; document.getElementById('2410.05684v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 8 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.08579">arXiv:2409.08579</a> <span> [<a href="https://arxiv.org/pdf/2409.08579">pdf</a>, <a href="https://arxiv.org/ps/2409.08579">ps</a>, <a href="https://arxiv.org/format/2409.08579">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> </div> </div> <p class="title is-5 mathjax"> Secure Offloading in NOMA-Aided Aerial MEC Systems Based on Deep Reinforcement Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lei%2C+H">Hongjiang Lei</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+M">Mingxu Yang</a>, <a href="/search/cs?searchtype=author&query=Park%2C+K">Ki-Hong Park</a>, <a href="/search/cs?searchtype=author&query=Pan%2C+G">Gaofeng Pan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.08579v2-abstract-short" style="display: inline;"> Mobile edge computing (MEC) technology can reduce user latency and energy consumption by offloading computationally intensive tasks to the edge servers. Unmanned aerial vehicles (UAVs) and non-orthogonal multiple access (NOMA) technology enable the MEC networks to provide offloaded computing services for massively accessed terrestrial users conveniently. However, the broadcast nature of signal pro… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.08579v2-abstract-full').style.display = 'inline'; document.getElementById('2409.08579v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.08579v2-abstract-full" style="display: none;"> Mobile edge computing (MEC) technology can reduce user latency and energy consumption by offloading computationally intensive tasks to the edge servers. Unmanned aerial vehicles (UAVs) and non-orthogonal multiple access (NOMA) technology enable the MEC networks to provide offloaded computing services for massively accessed terrestrial users conveniently. However, the broadcast nature of signal propagation in NOMA-based UAV-MEC networks makes it vulnerable to eavesdropping by malicious eavesdroppers. In this work, a secure offload scheme is proposed for NOMA-based UAV-MEC systems with the existence of an aerial eavesdropper. The long-term average network computational cost is minimized by jointly designing the UAV's trajectory, the terrestrial users' transmit power, and computational frequency while ensuring the security of users' offloaded data. Due to the eavesdropper's location uncertainty, the worst-case security scenario is considered through the estimated eavesdropping range. Due to the high-dimensional continuous action space, the deep deterministic policy gradient algorithm is utilized to solve the non-convex optimization problem. Simulation results validate the effectiveness of the proposed scheme. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.08579v2-abstract-full').style.display = 'none'; document.getElementById('2409.08579v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 13 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">12 pages, 7 figures, accepted by IEEE Journal on Miniaturization for Air and Space Systems</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.02111">arXiv:2409.02111</a> <span> [<a href="https://arxiv.org/pdf/2409.02111">pdf</a>, <a href="https://arxiv.org/format/2409.02111">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Toward Large-scale Spiking Neural Networks: A Comprehensive Survey and Future Directions </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Hu%2C+Y">Yangfan Hu</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+Q">Qian Zheng</a>, <a href="/search/cs?searchtype=author&query=Li%2C+G">Guoqi Li</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+H">Huajin Tang</a>, <a href="/search/cs?searchtype=author&query=Pan%2C+G">Gang Pan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.02111v1-abstract-short" style="display: inline;"> Deep learning has revolutionized artificial intelligence (AI), achieving remarkable progress in fields such as computer vision, speech recognition, and natural language processing. Moreover, the recent success of large language models (LLMs) has fueled a surge in research on large-scale neural networks. However, the escalating demand for computing resources and energy consumption has prompted the… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.02111v1-abstract-full').style.display = 'inline'; document.getElementById('2409.02111v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.02111v1-abstract-full" style="display: none;"> Deep learning has revolutionized artificial intelligence (AI), achieving remarkable progress in fields such as computer vision, speech recognition, and natural language processing. Moreover, the recent success of large language models (LLMs) has fueled a surge in research on large-scale neural networks. However, the escalating demand for computing resources and energy consumption has prompted the search for energy-efficient alternatives. Inspired by the human brain, spiking neural networks (SNNs) promise energy-efficient computation with event-driven spikes. To provide future directions toward building energy-efficient large SNN models, we present a survey of existing methods for developing deep spiking neural networks, with a focus on emerging Spiking Transformers. Our main contributions are as follows: (1) an overview of learning methods for deep spiking neural networks, categorized by ANN-to-SNN conversion and direct training with surrogate gradients; (2) an overview of network architectures for deep spiking neural networks, categorized by deep convolutional neural networks (DCNNs) and Transformer architecture; and (3) a comprehensive comparison of state-of-the-art deep SNNs with a focus on emerging Spiking Transformers. We then further discuss and outline future directions toward large-scale SNNs. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.02111v1-abstract-full').style.display = 'none'; document.getElementById('2409.02111v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.16564">arXiv:2408.16564</a> <span> [<a href="https://arxiv.org/pdf/2408.16564">pdf</a>, <a href="https://arxiv.org/format/2408.16564">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Human-Inspired Audio-Visual Speech Recognition: Spike Activity, Cueing Interaction and Causal Processing </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+Q">Qianhui Liu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+J">Jiadong Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yang Wang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+X">Xin Yang</a>, <a href="/search/cs?searchtype=author&query=Pan%2C+G">Gang Pan</a>, <a href="/search/cs?searchtype=author&query=Li%2C+H">Haizhou Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.16564v1-abstract-short" style="display: inline;"> Humans naturally perform audiovisual speech recognition (AVSR), enhancing the accuracy and robustness by integrating auditory and visual information. Spiking neural networks (SNNs), which mimic the brain's information-processing mechanisms, are well-suited for emulating the human capability of AVSR. Despite their potential, research on SNNs for AVSR is scarce, with most existing audio-visual multi… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.16564v1-abstract-full').style.display = 'inline'; document.getElementById('2408.16564v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.16564v1-abstract-full" style="display: none;"> Humans naturally perform audiovisual speech recognition (AVSR), enhancing the accuracy and robustness by integrating auditory and visual information. Spiking neural networks (SNNs), which mimic the brain's information-processing mechanisms, are well-suited for emulating the human capability of AVSR. Despite their potential, research on SNNs for AVSR is scarce, with most existing audio-visual multimodal methods focused on object or digit recognition. These models simply integrate features from both modalities, neglecting their unique characteristics and interactions. Additionally, they often rely on future information for current processing, which increases recognition latency and limits real-time applicability. Inspired by human speech perception, this paper proposes a novel human-inspired SNN named HI-AVSNN for AVSR, incorporating three key characteristics: cueing interaction, causal processing and spike activity. For cueing interaction, we propose a visual-cued auditory attention module (VCA2M) that leverages visual cues to guide attention to auditory features. We achieve causal processing by aligning the SNN's temporal dimension with that of visual and auditory features and applying temporal masking to utilize only past and current information. To implement spike activity, in addition to using SNNs, we leverage the event camera to capture lip movement as spikes, mimicking the human retina and providing efficient visual data. We evaluate HI-AVSNN on an audiovisual speech recognition dataset combining the DVS-Lip dataset with its corresponding audio samples. Experimental results demonstrate the superiority of our proposed fusion method, outperforming existing audio-visual SNN fusion methods and achieving a 2.27% improvement in accuracy over the only existing SNN-based AVSR method. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.16564v1-abstract-full').style.display = 'none'; document.getElementById('2408.16564v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.20947">arXiv:2407.20947</a> <span> [<a href="https://arxiv.org/pdf/2407.20947">pdf</a>, <a href="https://arxiv.org/format/2407.20947">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Neural and Evolutionary Computing">cs.NE</span> </div> </div> <p class="title is-5 mathjax"> An Asynchronous Multi-core Accelerator for SNN inference </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+Z">Zhuo Chen</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+D">De Ma</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+X">Xiaofei Jin</a>, <a href="/search/cs?searchtype=author&query=Xing%2C+Q">Qinghui Xing</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+O">Ouwen Jin</a>, <a href="/search/cs?searchtype=author&query=Du%2C+X">Xin Du</a>, <a href="/search/cs?searchtype=author&query=He%2C+S">Shuibing He</a>, <a href="/search/cs?searchtype=author&query=Pan%2C+G">Gang Pan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.20947v1-abstract-short" style="display: inline;"> Spiking Neural Networks (SNNs) are extensively utilized in brain-inspired computing and neuroscience research. To enhance the speed and energy efficiency of SNNs, several many-core accelerators have been developed. However, maintaining the accuracy of SNNs often necessitates frequent explicit synchronization among all cores, which presents a challenge to overall efficiency. In this paper, we propo… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.20947v1-abstract-full').style.display = 'inline'; document.getElementById('2407.20947v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.20947v1-abstract-full" style="display: none;"> Spiking Neural Networks (SNNs) are extensively utilized in brain-inspired computing and neuroscience research. To enhance the speed and energy efficiency of SNNs, several many-core accelerators have been developed. However, maintaining the accuracy of SNNs often necessitates frequent explicit synchronization among all cores, which presents a challenge to overall efficiency. In this paper, we propose an asynchronous architecture for Spiking Neural Networks (SNNs) that eliminates the need for inter-core synchronization, thus enhancing speed and energy efficiency. This approach leverages the pre-determined dependencies of neuromorphic cores established during compilation. Each core is equipped with a scheduler that monitors the status of its dependencies, allowing it to safely advance to the next timestep without waiting for other cores. This eliminates the necessity for global synchronization and minimizes core waiting time despite inherent workload imbalances. Comprehensive evaluations using five different SNN workloads show that our architecture achieves a 1.86x speedup and a 1.55x increase in energy efficiency compared to state-of-the-art synchronization architectures. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.20947v1-abstract-full').style.display = 'none'; document.getElementById('2407.20947v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.20852">arXiv:2407.20852</a> <span> [<a href="https://arxiv.org/pdf/2407.20852">pdf</a>, <a href="https://arxiv.org/format/2407.20852">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Networking and Internet Architecture">cs.NI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> Optimizing 5G-Advanced Networks for Time-critical Applications: The Role of L4S </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Pan%2C+G">Guangjin Pan</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+S">Shugong Xu</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+P">Pin Jiang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.20852v1-abstract-short" style="display: inline;"> As 5G networks strive to support advanced time-critical applications, such as immersive Extended Reality (XR), cloud gaming, and autonomous driving, the demand for Real-time Broadband Communication (RTBC) grows. In this article, we present the main mechanisms of Low Latency, Low Loss, and Scalable Throughput (L4S). Subsequently, we investigate the support and challenges of L4S technology in the la… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.20852v1-abstract-full').style.display = 'inline'; document.getElementById('2407.20852v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.20852v1-abstract-full" style="display: none;"> As 5G networks strive to support advanced time-critical applications, such as immersive Extended Reality (XR), cloud gaming, and autonomous driving, the demand for Real-time Broadband Communication (RTBC) grows. In this article, we present the main mechanisms of Low Latency, Low Loss, and Scalable Throughput (L4S). Subsequently, we investigate the support and challenges of L4S technology in the latest 3GPP 5G-Advanced Release 18 (R18) standard. Our case study, using a prototype system for a real-time communication (RTC) application, demonstrates the superiority of L4S technology. The experimental results show that, compared with the GCC algorithm, the proposed L4S-GCC algorithm can reduce the stalling rate by 1.51%-2.80% and increase the bandwidth utilization by 11.4%-31.4%. The results emphasize the immense potential of L4S technology in enhancing transmission performance in time-critical applications. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.20852v1-abstract-full').style.display = 'none'; document.getElementById('2407.20852v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">7 pages, 3 figures. This work has been submitted to the IEEE for possible publication</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.19271">arXiv:2407.19271</a> <span> [<a href="https://arxiv.org/pdf/2407.19271">pdf</a>, <a href="https://arxiv.org/format/2407.19271">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> Sewer Image Super-Resolution with Depth Priors and Its Lightweight Network </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Pan%2C+G">Gang Pan</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+C">Chen Wang</a>, <a href="/search/cs?searchtype=author&query=Sui%2C+Z">Zhijie Sui</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+S">Shuai Guo</a>, <a href="/search/cs?searchtype=author&query=Lv%2C+Y">Yaozhi Lv</a>, <a href="/search/cs?searchtype=author&query=Li%2C+H">Honglie Li</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+D">Di Sun</a>, <a href="/search/cs?searchtype=author&query=Xia%2C+Z">Zixia Xia</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.19271v2-abstract-short" style="display: inline;"> The Quick-view (QV) technique serves as a primary method for detecting defects within sewerage systems. However, the effectiveness of QV is impeded by the limited visual range of its hardware, resulting in suboptimal image quality for distant portions of the sewer network. Image super-resolution is an effective way to improve image quality and has been applied in a variety of scenes. However, rese… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.19271v2-abstract-full').style.display = 'inline'; document.getElementById('2407.19271v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.19271v2-abstract-full" style="display: none;"> The Quick-view (QV) technique serves as a primary method for detecting defects within sewerage systems. However, the effectiveness of QV is impeded by the limited visual range of its hardware, resulting in suboptimal image quality for distant portions of the sewer network. Image super-resolution is an effective way to improve image quality and has been applied in a variety of scenes. However, research on super-resolution for sewer images remains considerably unexplored. In response, this study leverages the inherent depth relationships present within QV images and introduces a novel Depth-guided, Reference-based Super-Resolution framework denoted as DSRNet. It comprises two core components: a depth extraction module and a depth information matching module (DMM). DSRNet utilizes the adjacent frames of the low-resolution image as reference images and helps them recover texture information based on the correlation. By combining these modules, the integration of depth priors significantly enhances both visual quality and performance benchmarks. Besides, in pursuit of computational efficiency and compactness, a super-resolution knowledge distillation model based on an attention mechanism is introduced. This mechanism facilitates the acquisition of feature similarity between a more complex teacher model and a streamlined student model, with the latter being a lightweight version of DSRNet. Experimental results demonstrate that DSRNet significantly improves PSNR and SSIM compared with other methods. This study also conducts experiments on sewer defect semantic segmentation, object detection, and classification on the Pipe dataset and Sewer-ML dataset. Experiments show that the method can improve the performance of low-resolution sewer images in these tasks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.19271v2-abstract-full').style.display = 'none'; document.getElementById('2407.19271v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 27 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.15170">arXiv:2407.15170</a> <span> [<a href="https://arxiv.org/pdf/2407.15170">pdf</a>, <a href="https://arxiv.org/format/2407.15170">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Semi-Supervised Pipe Video Temporal Defect Interval Localization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Huang%2C+Z">Zhu Huang</a>, <a href="/search/cs?searchtype=author&query=Pan%2C+G">Gang Pan</a>, <a href="/search/cs?searchtype=author&query=Kang%2C+C">Chao Kang</a>, <a href="/search/cs?searchtype=author&query=Lv%2C+Y">YaoZhi Lv</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.15170v1-abstract-short" style="display: inline;"> In sewer pipe Closed-Circuit Television (CCTV) inspection, accurate temporal defect localization is essential for effective defect classification, detection, segmentation and quantification. Industry standards typically do not require time-interval annotations, even though they are more informative than time-point annotations for defect localization, resulting in additional annotation costs when f… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.15170v1-abstract-full').style.display = 'inline'; document.getElementById('2407.15170v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.15170v1-abstract-full" style="display: none;"> In sewer pipe Closed-Circuit Television (CCTV) inspection, accurate temporal defect localization is essential for effective defect classification, detection, segmentation and quantification. Industry standards typically do not require time-interval annotations, even though they are more informative than time-point annotations for defect localization, resulting in additional annotation costs when fully supervised methods are used. Additionally, differences in scene types and camera motion patterns between pipe inspections and Temporal Action Localization (TAL) hinder the effective transfer of point-supervised TAL methods. Therefore, this study introduces a Semi-supervised multi-Prototype-based method incorporating visual Odometry for enhanced attention guidance (PipeSPO). PipeSPO fully leverages unlabeled data through unsupervised pretext tasks and utilizes time-point annotated data with a weakly supervised multi-prototype-based method, relying on visual odometry features to capture camera pose information. Experiments on real-world datasets demonstrate that PipeSPO achieves 41.89% average precision across Intersection over Union (IoU) thresholds of 0.1-0.7, improving by 8.14% over current state-of-the-art methods. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.15170v1-abstract-full').style.display = 'none'; document.getElementById('2407.15170v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">13 pages, 3 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.07314">arXiv:2407.07314</a> <span> [<a href="https://arxiv.org/pdf/2407.07314">pdf</a>, <a href="https://arxiv.org/ps/2407.07314">ps</a>, <a href="https://arxiv.org/format/2407.07314">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> </div> </div> <p class="title is-5 mathjax"> Proactive Eavesdropping in Relay Systems via Trajectory and Power Optimization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Dan%2C+Q">Qian Dan</a>, <a href="/search/cs?searchtype=author&query=Lei%2C+H">Hongjiang Lei</a>, <a href="/search/cs?searchtype=author&query=Park%2C+K">Ki-Hong Park</a>, <a href="/search/cs?searchtype=author&query=Lei%2C+W">Weijia Lei</a>, <a href="/search/cs?searchtype=author&query=Pan%2C+G">Gaofeng Pan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.07314v1-abstract-short" style="display: inline;"> Wireless relays can effectively extend the transmission range of information. However, if relay technology is utilized unlawfully, it can amplify potential harm. Effectively surveilling illegitimate relay links poses a challenging problem. Unmanned aerial vehicles (UAVs) can proactively surveil wireless relay systems due to their flexible mobility. This work focuses on maximizing the eavesdropping… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.07314v1-abstract-full').style.display = 'inline'; document.getElementById('2407.07314v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.07314v1-abstract-full" style="display: none;"> Wireless relays can effectively extend the transmission range of information. However, if relay technology is utilized unlawfully, it can amplify potential harm. Effectively surveilling illegitimate relay links poses a challenging problem. Unmanned aerial vehicles (UAVs) can proactively surveil wireless relay systems due to their flexible mobility. This work focuses on maximizing the eavesdropping rate (ER) of UAVs by jointly optimizing the trajectory and jamming power. To address this challenge, we propose a new iterative algorithm based on block coordinate descent and successive convex approximation technologies. Simulation results demonstrate that the proposed algorithm significantly enhances the ER through trajectory and jamming power optimization. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.07314v1-abstract-full').style.display = 'none'; document.getElementById('2407.07314v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">14 pages, 8 figures, submitted to IEEE Journal for review</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.06521">arXiv:2407.06521</a> <span> [<a href="https://arxiv.org/pdf/2407.06521">pdf</a>, <a href="https://arxiv.org/ps/2407.06521">ps</a>, <a href="https://arxiv.org/format/2407.06521">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Beamforming Design for Joint Target Sensing and Proactive Eavesdropping </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Dan%2C+Q">Qian Dan</a>, <a href="/search/cs?searchtype=author&query=Lei%2C+H">Hongjiang Lei</a>, <a href="/search/cs?searchtype=author&query=Park%2C+K">Ki-Hong Park</a>, <a href="/search/cs?searchtype=author&query=Pan%2C+G">Gaofeng Pan</a>, <a href="/search/cs?searchtype=author&query=Alouini%2C+M">Mohamed-Slim Alouini</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.06521v1-abstract-short" style="display: inline;"> This work studies the beamforming design in the joint target sensing and proactive eavesdropping (JTSAPE) system. The JTSAPE base station (BS) receives the information transmitted by the illegal transmitter and transmits the waveform for target sensing. The shared waveform also serves as artificial noise to interfere with the illegal receiver, thereby achieving proactive eavesdropping. We firstly… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.06521v1-abstract-full').style.display = 'inline'; document.getElementById('2407.06521v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.06521v1-abstract-full" style="display: none;"> This work studies the beamforming design in the joint target sensing and proactive eavesdropping (JTSAPE) system. The JTSAPE base station (BS) receives the information transmitted by the illegal transmitter and transmits the waveform for target sensing. The shared waveform also serves as artificial noise to interfere with the illegal receiver, thereby achieving proactive eavesdropping. We firstly optimize the transmitting beam of the BS to maximize the eavesdropping signal-to-interference-plus-noise ratio or minimize the target estimation parameter Cram{茅}r-Rao bound, respectively. Then, the joint optimization of proactive eavesdropping and target sensing is investigated, and the normalized weighted optimization problem is formulated. To address the complexity of the original problem, the formulated problem is decomposed into two subproblems: proactive eavesdropping and target sensing, which are solved by the semi-definite relaxation technique. Furthermore, the scenario in which the quality of the eavesdropping channel is stronger than that of the illegal channel is considered. We utilize the sequential rank-one constraint relaxation method and iteration technique to obtain the high-quality suboptimal solution of the beam transmit covariance matrix. Numerical simulation shows the effectiveness of our proposed algorithm. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.06521v1-abstract-full').style.display = 'none'; document.getElementById('2407.06521v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">26 pages, 6 figures, submitted to IEEE Journal for review</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.07268">arXiv:2406.07268</a> <span> [<a href="https://arxiv.org/pdf/2406.07268">pdf</a>, <a href="https://arxiv.org/format/2406.07268">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Advancing Grounded Multimodal Named Entity Recognition via LLM-Based Reformulation and Box-Based Segmentation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+J">Jinyuan Li</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Z">Ziyan Li</a>, <a href="/search/cs?searchtype=author&query=Li%2C+H">Han Li</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+J">Jianfei Yu</a>, <a href="/search/cs?searchtype=author&query=Xia%2C+R">Rui Xia</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+D">Di Sun</a>, <a href="/search/cs?searchtype=author&query=Pan%2C+G">Gang Pan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.07268v1-abstract-short" style="display: inline;"> Grounded Multimodal Named Entity Recognition (GMNER) task aims to identify named entities, entity types and their corresponding visual regions. GMNER task exhibits two challenging attributes: 1) The tenuous correlation between images and text on social media contributes to a notable proportion of named entities being ungroundable. 2) There exists a distinction between coarse-grained noun phrases u… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.07268v1-abstract-full').style.display = 'inline'; document.getElementById('2406.07268v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.07268v1-abstract-full" style="display: none;"> Grounded Multimodal Named Entity Recognition (GMNER) task aims to identify named entities, entity types and their corresponding visual regions. GMNER task exhibits two challenging attributes: 1) The tenuous correlation between images and text on social media contributes to a notable proportion of named entities being ungroundable. 2) There exists a distinction between coarse-grained noun phrases used in similar tasks (e.g., phrase localization) and fine-grained named entities. In this paper, we propose RiVEG, a unified framework that reformulates GMNER into a joint MNER-VE-VG task by leveraging large language models (LLMs) as connecting bridges. This reformulation brings two benefits: 1) It enables us to optimize the MNER module for optimal MNER performance and eliminates the need to pre-extract region features using object detection methods, thus naturally addressing the two major limitations of existing GMNER methods. 2) The introduction of Entity Expansion Expression module and Visual Entailment (VE) module unifies Visual Grounding (VG) and Entity Grounding (EG). This endows the proposed framework with unlimited data and model scalability. Furthermore, to address the potential ambiguity stemming from the coarse-grained bounding box output in GMNER, we further construct the new Segmented Multimodal Named Entity Recognition (SMNER) task and corresponding Twitter-SMNER dataset aimed at generating fine-grained segmentation masks, and experimentally demonstrate the feasibility and effectiveness of using box prompt-based Segment Anything Model (SAM) to empower any GMNER model with the ability to accomplish the SMNER task. Extensive experiments demonstrate that RiVEG significantly outperforms SoTA methods on four datasets across the MNER, GMNER, and SMNER tasks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.07268v1-abstract-full').style.display = 'none'; document.getElementById('2406.07268v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Extension of our Findings of EMNLP 2023 & ACL 2024 paper</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.06842">arXiv:2406.06842</a> <span> [<a href="https://arxiv.org/pdf/2406.06842">pdf</a>, <a href="https://arxiv.org/ps/2406.06842">ps</a>, <a href="https://arxiv.org/format/2406.06842">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Aerial Relay to Achieve Covertness and Security </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Jiang%2C+J">Jiacheng Jiang</a>, <a href="/search/cs?searchtype=author&query=Lei%2C+H">Hongjiang Lei</a>, <a href="/search/cs?searchtype=author&query=Park%2C+K">Ki-Hong Park</a>, <a href="/search/cs?searchtype=author&query=Pan%2C+G">Gaofeng Pan</a>, <a href="/search/cs?searchtype=author&query=Alouini%2C+M">Mohamed-Slim Alouini</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.06842v1-abstract-short" style="display: inline;"> In this work, a delay-tolerant unmanned aerial vehicle (UAV) relayed covert and secure communication framework is investigated. In this framework, a legitimate UAV serves as an aerial relay to realize communication when the direct link between the terrestrial transmitter and receiver is blocked and also acts as a friendly jammer to suppress the malicious nodes presented on the ground. Subsequently… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.06842v1-abstract-full').style.display = 'inline'; document.getElementById('2406.06842v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.06842v1-abstract-full" style="display: none;"> In this work, a delay-tolerant unmanned aerial vehicle (UAV) relayed covert and secure communication framework is investigated. In this framework, a legitimate UAV serves as an aerial relay to realize communication when the direct link between the terrestrial transmitter and receiver is blocked and also acts as a friendly jammer to suppress the malicious nodes presented on the ground. Subsequently, considering the uncertainty of malicious nodes' positions, a robust fractional programming optimization problem is built to maximize energy efficiency by jointly optimizing the trajectory of the UAV, the transmit power of the transmitter, and the time-switching factor. For the extremely complicated covert constraint, Pinsker's inequality, Jensen's inequality, and the bisection search method are employed to construct a tractable shrunken one. After this, an alternate optimization-based algorithm is proposed to solve the fractional programming optimization problem. To achieve low complexity, we design the primal-dual search-based algorithm and the successive convex approximation-based algorithm, respectively, for each sub-problem. Numerical results show the effectiveness of our proposed algorithm. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.06842v1-abstract-full').style.display = 'none'; document.getElementById('2406.06842v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">12 pages, 6 figures, submitted to IEEE Journal for review</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.05936">arXiv:2406.05936</a> <span> [<a href="https://arxiv.org/pdf/2406.05936">pdf</a>, <a href="https://arxiv.org/ps/2406.05936">ps</a>, <a href="https://arxiv.org/format/2406.05936">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> </div> </div> <p class="title is-5 mathjax"> Multi-UAV Trajectory Design for Fair and Secure Communication </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lei%2C+H">Hongjiang Lei</a>, <a href="/search/cs?searchtype=author&query=Meng%2C+D">Dongyang Meng</a>, <a href="/search/cs?searchtype=author&query=Ran%2C+H">Haoxiang Ran</a>, <a href="/search/cs?searchtype=author&query=Park%2C+K">Ki-Hong Park</a>, <a href="/search/cs?searchtype=author&query=Pan%2C+G">Gaofeng Pan</a>, <a href="/search/cs?searchtype=author&query=Alouini%2C+M">Mohamed-Slim Alouini</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.05936v1-abstract-short" style="display: inline;"> Unmanned aerial vehicles (UAVs) play an essential role in future wireless communication networks due to their high mobility, low cost, and on-demand deployment. In air-to-ground links, UAVs are widely used to enhance the performance of wireless communication systems due to the presence of high-probability line-of-sight (LoS) links. However, the high probability of LoS links also increases the risk… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.05936v1-abstract-full').style.display = 'inline'; document.getElementById('2406.05936v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.05936v1-abstract-full" style="display: none;"> Unmanned aerial vehicles (UAVs) play an essential role in future wireless communication networks due to their high mobility, low cost, and on-demand deployment. In air-to-ground links, UAVs are widely used to enhance the performance of wireless communication systems due to the presence of high-probability line-of-sight (LoS) links. However, the high probability of LoS links also increases the risk of being eavesdropped, posing a significant challenge to the security of wireless communications. In this work, the secure communication problem in a multi-UAV-assisted communication system is investigated in a moving airborne eavesdropping scenario. To improve the secrecy performance of the considered communication system, aerial eavesdropping capability is suppressed by sending jamming signals from a friendly UAV. An optimization problem under flight conditions, fairness, and limited energy consumption constraints of multiple UAVs is formulated to maximize the fair sum secrecy throughput. Given the complexity and non-convex nature of the problem, we propose a two-step-based optimization approach. The first step employs the $K$-means algorithm to cluster users and associate them with multiple communication UAVs. Then, a multi-agent deep deterministic policy gradient-based algorithm is introduced to solve this optimization problem. The effectiveness of this proposed algorithm is not only theoretically but also rigorously verified by simulation results. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.05936v1-abstract-full').style.display = 'none'; document.getElementById('2406.05936v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">14 pages, 10 figures, submitted to IEEE Journal for review</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.01883">arXiv:2406.01883</a> <span> [<a href="https://arxiv.org/pdf/2406.01883">pdf</a>, <a href="https://arxiv.org/format/2406.01883">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Neural and Evolutionary Computing">cs.NE</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> </div> </div> <p class="title is-5 mathjax"> Context Gating in Spiking Neural Networks: Achieving Lifelong Learning through Integration of Local and Global Plasticity </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Shen%2C+J">Jiangrong Shen</a>, <a href="/search/cs?searchtype=author&query=Ni%2C+W">Wenyao Ni</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+Q">Qi Xu</a>, <a href="/search/cs?searchtype=author&query=Pan%2C+G">Gang Pan</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+H">Huajin Tang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.01883v1-abstract-short" style="display: inline;"> Humans learn multiple tasks in succession with minimal mutual interference, through the context gating mechanism in the prefrontal cortex (PFC). The brain-inspired models of spiking neural networks (SNN) have drawn massive attention for their energy efficiency and biological plausibility. To overcome catastrophic forgetting when learning multiple tasks in sequence, current SNN models for lifelong… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.01883v1-abstract-full').style.display = 'inline'; document.getElementById('2406.01883v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.01883v1-abstract-full" style="display: none;"> Humans learn multiple tasks in succession with minimal mutual interference, through the context gating mechanism in the prefrontal cortex (PFC). The brain-inspired models of spiking neural networks (SNN) have drawn massive attention for their energy efficiency and biological plausibility. To overcome catastrophic forgetting when learning multiple tasks in sequence, current SNN models for lifelong learning focus on memory reserving or regularization-based modification, while lacking SNN to replicate human experimental behavior. Inspired by biological context-dependent gating mechanisms found in PFC, we propose SNN with context gating trained by the local plasticity rule (CG-SNN) for lifelong learning. The iterative training between global and local plasticity for task units is designed to strengthen the connections between task neurons and hidden neurons and preserve the multi-task relevant information. The experiments show that the proposed model is effective in maintaining the past learning experience and has better task-selectivity than other methods during lifelong learning. Our results provide new insights that the CG-SNN model can extend context gating with good scalability on different SNN architectures with different spike-firing mechanisms. Thus, our models have good potential for parallel implementation on neuromorphic hardware and model human's behavior. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.01883v1-abstract-full').style.display = 'none'; document.getElementById('2406.01883v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.01313">arXiv:2406.01313</a> <span> [<a href="https://arxiv.org/pdf/2406.01313">pdf</a>, <a href="https://arxiv.org/ps/2406.01313">ps</a>, <a href="https://arxiv.org/format/2406.01313">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> 3D Trajectory Design for Energy-constrained Aerial CRNs Under Probabilistic LoS Channel </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lei%2C+H">Hongjiang Lei</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+X">Xiaqiu Wu</a>, <a href="/search/cs?searchtype=author&query=Park%2C+K">Ki-Hong Park</a>, <a href="/search/cs?searchtype=author&query=Pan%2C+G">Gaofeng Pan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.01313v1-abstract-short" style="display: inline;"> Unmanned aerial vehicles (UAVs) have been attracting significant attention because there is a high probability of line-of-sight links being obtained between them and terrestrial nodes in high-rise urban areas. In this work, we investigate cognitive radio networks (CRNs) by jointly designing three-dimensional (3D) trajectory, the transmit power of the UAV, and user scheduling. Considering the UAV's… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.01313v1-abstract-full').style.display = 'inline'; document.getElementById('2406.01313v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.01313v1-abstract-full" style="display: none;"> Unmanned aerial vehicles (UAVs) have been attracting significant attention because there is a high probability of line-of-sight links being obtained between them and terrestrial nodes in high-rise urban areas. In this work, we investigate cognitive radio networks (CRNs) by jointly designing three-dimensional (3D) trajectory, the transmit power of the UAV, and user scheduling. Considering the UAV's onboard energy consumption, an optimization problem is formulated in which the average achievable rate of the considered system is maximized by jointly optimizing the UAV's 3D trajectory, transmission power, and user scheduling. Due to the non-convex optimization problem, a lower bound on the average achievable rate is utilized to reduce the complexity of the solution. Subsequently, the original optimization problem is decoupled into four subproblems by using block coordinate descent, and each subproblem is transformed into manageable convex optimization problems by introducing slack variables and successive convex approximation. Numerical results validate the effectiveness of our proposed algorithm and demonstrate that the 3D trajectories of UAVs can enhance the average achievable rate of aerial CRNs. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.01313v1-abstract-full').style.display = 'none'; document.getElementById('2406.01313v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">13 pages, 6 figures,submitted to the IEEE journal for review</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.01072">arXiv:2406.01072</a> <span> [<a href="https://arxiv.org/pdf/2406.01072">pdf</a>, <a href="https://arxiv.org/format/2406.01072">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Neural and Evolutionary Computing">cs.NE</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Towards Efficient Deep Spiking Neural Networks Construction with Spiking Activity based Pruning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yaxin Li</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+Q">Qi Xu</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+J">Jiangrong Shen</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+H">Hongming Xu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+L">Long Chen</a>, <a href="/search/cs?searchtype=author&query=Pan%2C+G">Gang Pan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.01072v1-abstract-short" style="display: inline;"> The emergence of deep and large-scale spiking neural networks (SNNs) exhibiting high performance across diverse complex datasets has led to a need for compressing network models due to the presence of a significant number of redundant structural units, aiming to more effectively leverage their low-power consumption and biological interpretability advantages. Currently, most model compression techn… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.01072v1-abstract-full').style.display = 'inline'; document.getElementById('2406.01072v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.01072v1-abstract-full" style="display: none;"> The emergence of deep and large-scale spiking neural networks (SNNs) exhibiting high performance across diverse complex datasets has led to a need for compressing network models due to the presence of a significant number of redundant structural units, aiming to more effectively leverage their low-power consumption and biological interpretability advantages. Currently, most model compression techniques for SNNs are based on unstructured pruning of individual connections, which requires specific hardware support. Hence, we propose a structured pruning approach based on the activity levels of convolutional kernels named Spiking Channel Activity-based (SCA) network pruning framework. Inspired by synaptic plasticity mechanisms, our method dynamically adjusts the network's structure by pruning and regenerating convolutional kernels during training, enhancing the model's adaptation to the current target task. While maintaining model performance, this approach refines the network architecture, ultimately reducing computational load and accelerating the inference process. This indicates that structured dynamic sparse learning methods can better facilitate the application of deep SNNs in low-power and high-efficiency scenarios. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.01072v1-abstract-full').style.display = 'none'; document.getElementById('2406.01072v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.19194">arXiv:2405.19194</a> <span> [<a href="https://arxiv.org/pdf/2405.19194">pdf</a>, <a href="https://arxiv.org/format/2405.19194">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> LOGO: Video Text Spotting with Language Collaboration and Glyph Perception Model </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+H">Hongen Liu</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+D">Di Sun</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+J">Jiahao Wang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yi Liu</a>, <a href="/search/cs?searchtype=author&query=Pan%2C+G">Gang Pan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.19194v2-abstract-short" style="display: inline;"> Video text spotting (VTS) aims to simultaneously localize, recognize and track text instances in videos. To address the limited recognition capability of end-to-end methods, recent methods track the zero-shot results of state-of-the-art image text spotters directly, and achieve impressive performance. However, owing to the domain gap between different datasets, these methods usually obtain limited… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.19194v2-abstract-full').style.display = 'inline'; document.getElementById('2405.19194v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.19194v2-abstract-full" style="display: none;"> Video text spotting (VTS) aims to simultaneously localize, recognize and track text instances in videos. To address the limited recognition capability of end-to-end methods, recent methods track the zero-shot results of state-of-the-art image text spotters directly, and achieve impressive performance. However, owing to the domain gap between different datasets, these methods usually obtain limited tracking trajectories on extreme dataset. Fine-tuning transformer-based text spotters on specific datasets could yield performance enhancements, albeit at the expense of considerable training resources. In this paper, we propose a Language Collaboration and Glyph Perception Model, termed LOGO, an innovative framework designed to enhance the performance of conventional text spotters. To achieve this goal, we design a language synergy classifier (LSC) to explicitly discern text instances from background noise in the recognition stage. Specially, the language synergy classifier can output text content or background code based on the legibility of text regions, thus computing language scores. Subsequently, fusion scores are computed by taking the average of detection scores and language scores, and are utilized to re-score the detection results before tracking. By the re-scoring mechanism, the proposed LSC facilitates the detection of low-resolution text instances while filtering out text-like regions. Moreover, the glyph supervision is introduced to enhance the recognition accuracy of noisy text regions. In addition, we propose the visual position mixture module, which can merge the position information and visual features efficiently, and acquire more discriminative tracking features. Extensive experiments on public benchmarks validate the effectiveness of the proposed method. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.19194v2-abstract-full').style.display = 'none'; document.getElementById('2405.19194v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 29 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.17879">arXiv:2405.17879</a> <span> [<a href="https://arxiv.org/pdf/2405.17879">pdf</a>, <a href="https://arxiv.org/format/2405.17879">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Resisting Stochastic Risks in Diffusion Planners with the Trajectory Aggregation Tree </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Feng%2C+L">Lang Feng</a>, <a href="/search/cs?searchtype=author&query=Gu%2C+P">Pengjie Gu</a>, <a href="/search/cs?searchtype=author&query=An%2C+B">Bo An</a>, <a href="/search/cs?searchtype=author&query=Pan%2C+G">Gang Pan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.17879v2-abstract-short" style="display: inline;"> Diffusion planners have shown promise in handling long-horizon and sparse-reward tasks due to the non-autoregressive plan generation. However, their inherent stochastic risk of generating infeasible trajectories presents significant challenges to their reliability and stability. We introduce a novel approach, the Trajectory Aggregation Tree (TAT), to address this issue in diffusion planners. Compa… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.17879v2-abstract-full').style.display = 'inline'; document.getElementById('2405.17879v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.17879v2-abstract-full" style="display: none;"> Diffusion planners have shown promise in handling long-horizon and sparse-reward tasks due to the non-autoregressive plan generation. However, their inherent stochastic risk of generating infeasible trajectories presents significant challenges to their reliability and stability. We introduce a novel approach, the Trajectory Aggregation Tree (TAT), to address this issue in diffusion planners. Compared to prior methods that rely solely on raw trajectory predictions, TAT aggregates information from both historical and current trajectories, forming a dynamic tree-like structure. Each trajectory is conceptualized as a branch and individual states as nodes. As the structure evolves with the integration of new trajectories, unreliable states are marginalized, and the most impactful nodes are prioritized for decision-making. TAT can be deployed without modifying the original training and sampling pipelines of diffusion planners, making it a training-free, ready-to-deploy solution. We provide both theoretical analysis and empirical evidence to support TAT's effectiveness. Our results highlight its remarkable ability to resist the risk from unreliable trajectories, guarantee the performance boosting of diffusion planners in $100\%$ of tasks, and exhibit an appreciable tolerance margin for sample quality, thereby enabling planning with a more than $3\times$ acceleration. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.17879v2-abstract-full').style.display = 'none'; document.getElementById('2405.17879v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 28 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ICML 2024 (Spotlight)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.07689">arXiv:2405.07689</a> <span> [<a href="https://arxiv.org/pdf/2405.07689">pdf</a>, <a href="https://arxiv.org/format/2405.07689">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Networking and Internet Architecture">cs.NI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> Quality of Experience Optimization for Real-time XR Video Transmission with Energy Constraints </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Pan%2C+G">Guangjin Pan</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+S">Shugong Xu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+S">Shunqing Zhang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+X">Xiaojing Chen</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+Y">Yanzan Sun</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.07689v1-abstract-short" style="display: inline;"> Extended Reality (XR) is an important service in the 5G network and in future 6G networks. In contrast to traditional video on demand services, real-time XR video is transmitted frame-by-frame, requiring low latency and being highly sensitive to network fluctuations. In this paper, we model the quality of experience (QoE) for real-time XR video transmission on a frame-by-frame basis. Based on the… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.07689v1-abstract-full').style.display = 'inline'; document.getElementById('2405.07689v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.07689v1-abstract-full" style="display: none;"> Extended Reality (XR) is an important service in the 5G network and in future 6G networks. In contrast to traditional video on demand services, real-time XR video is transmitted frame-by-frame, requiring low latency and being highly sensitive to network fluctuations. In this paper, we model the quality of experience (QoE) for real-time XR video transmission on a frame-by-frame basis. Based on the proposed QoE model, we formulate an optimization problem that maximizes QoE with constraints on wireless resources and long-term energy consumption. We utilize Lyapunov optimization to transform the original problem into a single-frame optimization problem and then allocate wireless subchannels. We propose an adaptive XR video bitrate algorithm that employs a Long Short Term Memory (LSTM) based Deep Q-Network (DQN) algorithm for video bitrate selection. Through numerical results, we show that our proposed algorithm outperforms the baseline algorithms, with the average QoE improvements of 0.04 to 0.46. Specifically, compared to baseline algorithms, the proposed algorithm reduces average video quality variations by 29% to 50% and improves the frame transmission success rate by 5% to 48%. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.07689v1-abstract-full').style.display = 'none'; document.getElementById('2405.07689v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">6 pages, 5 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.02572">arXiv:2405.02572</a> <span> [<a href="https://arxiv.org/pdf/2405.02572">pdf</a>, <a href="https://arxiv.org/format/2405.02572">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Off-OAB: Off-Policy Policy Gradient Method with Optimal Action-Dependent Baseline </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Meng%2C+W">Wenjia Meng</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+Q">Qian Zheng</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+L">Long Yang</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+Y">Yilong Yin</a>, <a href="/search/cs?searchtype=author&query=Pan%2C+G">Gang Pan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.02572v1-abstract-short" style="display: inline;"> Policy-based methods have achieved remarkable success in solving challenging reinforcement learning problems. Among these methods, off-policy policy gradient methods are particularly important due to that they can benefit from off-policy data. However, these methods suffer from the high variance of the off-policy policy gradient (OPPG) estimator, which results in poor sample efficiency during trai… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.02572v1-abstract-full').style.display = 'inline'; document.getElementById('2405.02572v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.02572v1-abstract-full" style="display: none;"> Policy-based methods have achieved remarkable success in solving challenging reinforcement learning problems. Among these methods, off-policy policy gradient methods are particularly important due to that they can benefit from off-policy data. However, these methods suffer from the high variance of the off-policy policy gradient (OPPG) estimator, which results in poor sample efficiency during training. In this paper, we propose an off-policy policy gradient method with the optimal action-dependent baseline (Off-OAB) to mitigate this variance issue. Specifically, this baseline maintains the OPPG estimator's unbiasedness while theoretically minimizing its variance. To enhance practical computational efficiency, we design an approximated version of this optimal baseline. Utilizing this approximation, our method (Off-OAB) aims to decrease the OPPG estimator's variance during policy optimization. We evaluate the proposed Off-OAB method on six representative tasks from OpenAI Gym and MuJoCo, where it demonstrably surpasses state-of-the-art methods on the majority of these tasks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.02572v1-abstract-full').style.display = 'none'; document.getElementById('2405.02572v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">12 pages, 3 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.19582">arXiv:2404.19582</a> <span> [<a href="https://arxiv.org/pdf/2404.19582">pdf</a>, <a href="https://arxiv.org/format/2404.19582">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> </div> </div> <p class="title is-5 mathjax"> URVFL: Undetectable Data Reconstruction Attack on Vertical Federated Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yao%2C+D">Duanyi Yao</a>, <a href="/search/cs?searchtype=author&query=Li%2C+S">Songze Li</a>, <a href="/search/cs?searchtype=author&query=Gong%2C+X">Xueluan Gong</a>, <a href="/search/cs?searchtype=author&query=Hou%2C+S">Sizai Hou</a>, <a href="/search/cs?searchtype=author&query=Pan%2C+G">Gaoning Pan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.19582v2-abstract-short" style="display: inline;"> Launching effective malicious attacks in VFL presents unique challenges: 1) Firstly, given the distributed nature of clients' data features and models, each client rigorously guards its privacy and prohibits direct querying, complicating any attempts to steal data; 2) Existing malicious attacks alter the underlying VFL training task, and are hence easily detected by comparing the received gradient… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.19582v2-abstract-full').style.display = 'inline'; document.getElementById('2404.19582v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.19582v2-abstract-full" style="display: none;"> Launching effective malicious attacks in VFL presents unique challenges: 1) Firstly, given the distributed nature of clients' data features and models, each client rigorously guards its privacy and prohibits direct querying, complicating any attempts to steal data; 2) Existing malicious attacks alter the underlying VFL training task, and are hence easily detected by comparing the received gradients with the ones received in honest training. To overcome these challenges, we develop URVFL, a novel attack strategy that evades current detection mechanisms. The key idea is to integrate a discriminator with auxiliary classifier that takes a full advantage of the label information and generates malicious gradients to the victim clients: on one hand, label information helps to better characterize embeddings of samples from distinct classes, yielding an improved reconstruction performance; on the other hand, computing malicious gradients with label information better mimics the honest training, making the malicious gradients indistinguishable from the honest ones, and the attack much more stealthy. Our comprehensive experiments demonstrate that URVFL significantly outperforms existing attacks, and successfully circumvents SOTA detection methods for malicious attacks. Additional ablation studies and evaluations on defenses further underscore the robustness and effectiveness of URVFL. Our code will be available at https://github.com/duanyiyao/URVFL. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.19582v2-abstract-full').style.display = 'none'; document.getElementById('2404.19582v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 30 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by NDSS 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.18225">arXiv:2404.18225</a> <span> [<a href="https://arxiv.org/pdf/2404.18225">pdf</a>, <a href="https://arxiv.org/format/2404.18225">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> Quadruped robot traversing 3D complex environments with limited perception </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Cheng%2C+Y">Yi Cheng</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+H">Hang Liu</a>, <a href="/search/cs?searchtype=author&query=Pan%2C+G">Guoping Pan</a>, <a href="/search/cs?searchtype=author&query=Ye%2C+L">Linqi Ye</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+H">Houde Liu</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+B">Bin Liang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.18225v3-abstract-short" style="display: inline;"> Traversing 3-D complex environments has always been a significant challenge for legged locomotion. Existing methods typically rely on external sensors such as vision and lidar to preemptively react to obstacles by acquiring environmental information. However, in scenarios like nighttime or dense forests, external sensors often fail to function properly, necessitating robots to rely on propriocepti… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.18225v3-abstract-full').style.display = 'inline'; document.getElementById('2404.18225v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.18225v3-abstract-full" style="display: none;"> Traversing 3-D complex environments has always been a significant challenge for legged locomotion. Existing methods typically rely on external sensors such as vision and lidar to preemptively react to obstacles by acquiring environmental information. However, in scenarios like nighttime or dense forests, external sensors often fail to function properly, necessitating robots to rely on proprioceptive sensors to perceive diverse obstacles in the environment and respond promptly. This task is undeniably challenging. Our research finds that methods based on collision detection can enhance a robot's perception of environmental obstacles. In this work, we propose an end-to-end learning-based quadruped robot motion controller that relies solely on proprioceptive sensing. This controller can accurately detect, localize, and agilely respond to collisions in unknown and complex 3D environments, thereby improving the robot's traversability in complex environments. We demonstrate in both simulation and real-world experiments that our method enables quadruped robots to successfully traverse challenging obstacles in various complex environments. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.18225v3-abstract-full').style.display = 'none'; document.getElementById('2404.18225v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 28 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">10 pages, 8 figures,submitted to iros2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.09905">arXiv:2404.09905</a> <span> [<a href="https://arxiv.org/pdf/2404.09905">pdf</a>, <a href="https://arxiv.org/format/2404.09905">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Networking and Internet Architecture">cs.NI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> Quality of Experience Oriented Cross-layer Optimization for Real-time XR Video Transmission </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Pan%2C+G">Guangjin Pan</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+S">Shugong Xu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+S">Shunqing Zhang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+X">Xiaojing Chen</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+Y">Yanzan Sun</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.09905v1-abstract-short" style="display: inline;"> Extended reality (XR) is one of the most important applications of beyond 5G and 6G networks. Real-time XR video transmission presents challenges in terms of data rate and delay. In particular, the frame-by-frame transmission mode of XR video makes real-time XR video very sensitive to dynamic network environments. To improve the users' quality of experience (QoE), we design a cross-layer transmiss… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.09905v1-abstract-full').style.display = 'inline'; document.getElementById('2404.09905v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.09905v1-abstract-full" style="display: none;"> Extended reality (XR) is one of the most important applications of beyond 5G and 6G networks. Real-time XR video transmission presents challenges in terms of data rate and delay. In particular, the frame-by-frame transmission mode of XR video makes real-time XR video very sensitive to dynamic network environments. To improve the users' quality of experience (QoE), we design a cross-layer transmission framework for real-time XR video. The proposed framework allows the simple information exchange between the base station (BS) and the XR server, which assists in adaptive bitrate and wireless resource scheduling. We utilize the cross-layer information to formulate the problem of maximizing user QoE by finding the optimal scheduling and bitrate adjustment strategies. To address the issue of mismatched time scales between two strategies, we decouple the original problem and solve them individually using a multi-agent-based approach. Specifically, we propose the multi-step Deep Q-network (MS-DQN) algorithm to obtain a frame-priority-based wireless resource scheduling strategy and then propose the Transformer-based Proximal Policy Optimization (TPPO) algorithm for video bitrate adaptation. The experimental results show that the TPPO+MS-DQN algorithm proposed in this study can improve the QoE by 3.6% to 37.8%. More specifically, the proposed MS-DQN algorithm enhances the transmission quality by 49.9%-80.2%. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.09905v1-abstract-full').style.display = 'none'; document.getElementById('2404.09905v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">14 pages, 13 figures. arXiv admin note: text overlap with arXiv:2402.01180</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.01612">arXiv:2404.01612</a> <span> [<a href="https://arxiv.org/pdf/2404.01612">pdf</a>, <a href="https://arxiv.org/format/2404.01612">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Spin-UP: Spin Light for Natural Light Uncalibrated Photometric Stereo </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zongrui Li</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+Z">Zhan Lu</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+H">Haojie Yan</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+B">Boxin Shi</a>, <a href="/search/cs?searchtype=author&query=Pan%2C+G">Gang Pan</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+Q">Qian Zheng</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+X">Xudong Jiang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.01612v1-abstract-short" style="display: inline;"> Natural Light Uncalibrated Photometric Stereo (NaUPS) relieves the strict environment and light assumptions in classical Uncalibrated Photometric Stereo (UPS) methods. However, due to the intrinsic ill-posedness and high-dimensional ambiguities, addressing NaUPS is still an open question. Existing works impose strong assumptions on the environment lights and objects' material, restricting the effe… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.01612v1-abstract-full').style.display = 'inline'; document.getElementById('2404.01612v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.01612v1-abstract-full" style="display: none;"> Natural Light Uncalibrated Photometric Stereo (NaUPS) relieves the strict environment and light assumptions in classical Uncalibrated Photometric Stereo (UPS) methods. However, due to the intrinsic ill-posedness and high-dimensional ambiguities, addressing NaUPS is still an open question. Existing works impose strong assumptions on the environment lights and objects' material, restricting the effectiveness in more general scenarios. Alternatively, some methods leverage supervised learning with intricate models while lacking interpretability, resulting in a biased estimation. In this work, we proposed Spin Light Uncalibrated Photometric Stereo (Spin-UP), an unsupervised method to tackle NaUPS in various environment lights and objects. The proposed method uses a novel setup that captures the object's images on a rotatable platform, which mitigates NaUPS's ill-posedness by reducing unknowns and provides reliable priors to alleviate NaUPS's ambiguities. Leveraging neural inverse rendering and the proposed training strategies, Spin-UP recovers surface normals, environment light, and isotropic reflectance under complex natural light with low computational cost. Experiments have shown that Spin-UP outperforms other supervised / unsupervised NaUPS methods and achieves state-of-the-art performance on synthetic and real-world datasets. Codes and data are available at https://github.com/LMozart/CVPR2024-SpinUP. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.01612v1-abstract-full').style.display = 'none'; document.getElementById('2404.01612v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Paper accepted by CVPR2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.17367">arXiv:2403.17367</a> <span> [<a href="https://arxiv.org/pdf/2403.17367">pdf</a>, <a href="https://arxiv.org/format/2403.17367">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> RoboDuet: Learning a Cooperative Policy for Whole-body Legged Loco-Manipulation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Pan%2C+G">Guoping Pan</a>, <a href="/search/cs?searchtype=author&query=Ben%2C+Q">Qingwei Ben</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+Z">Zhecheng Yuan</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+G">Guangqi Jiang</a>, <a href="/search/cs?searchtype=author&query=Ji%2C+Y">Yandong Ji</a>, <a href="/search/cs?searchtype=author&query=Li%2C+S">Shoujie Li</a>, <a href="/search/cs?searchtype=author&query=Pang%2C+J">Jiangmiao Pang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+H">Houde Liu</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+H">Huazhe Xu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.17367v5-abstract-short" style="display: inline;"> Fully leveraging the loco-manipulation capabilities of a quadruped robot equipped with a robotic arm is non-trivial, as it requires controlling all degrees of freedom (DoFs) of the quadruped robot to achieve effective whole-body coordination. In this letter, we propose a novel framework RoboDuet, which employs two collaborative policies to realize locomotion and manipulation simultaneously, achiev… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.17367v5-abstract-full').style.display = 'inline'; document.getElementById('2403.17367v5-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.17367v5-abstract-full" style="display: none;"> Fully leveraging the loco-manipulation capabilities of a quadruped robot equipped with a robotic arm is non-trivial, as it requires controlling all degrees of freedom (DoFs) of the quadruped robot to achieve effective whole-body coordination. In this letter, we propose a novel framework RoboDuet, which employs two collaborative policies to realize locomotion and manipulation simultaneously, achieving whole-body control through mutual interactions. Beyond enabling large-range 6D pose tracking for manipulation, we find that the two-policy framework supports zero-shot transfer across quadruped robots with similar morphology and physical dimensions in the real world. Our experiments demonstrate that RoboDuet achieves a 23% improvement in success rate over the baseline in challenging loco-manipulation tasks employing whole-body control. To support further research, we provide open-source code and additional videos on our website: locomanip-duet.github.io. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.17367v5-abstract-full').style.display = 'none'; document.getElementById('2403.17367v5-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 26 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2402.09989">arXiv:2402.09989</a> <span> [<a href="https://arxiv.org/pdf/2402.09989">pdf</a>, <a href="https://arxiv.org/format/2402.09989">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> LLMs as Bridges: Reformulating Grounded Multimodal Named Entity Recognition </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+J">Jinyuan Li</a>, <a href="/search/cs?searchtype=author&query=Li%2C+H">Han Li</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+D">Di Sun</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+J">Jiahao Wang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+W">Wenkun Zhang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zan Wang</a>, <a href="/search/cs?searchtype=author&query=Pan%2C+G">Gang Pan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2402.09989v4-abstract-short" style="display: inline;"> Grounded Multimodal Named Entity Recognition (GMNER) is a nascent multimodal task that aims to identify named entities, entity types and their corresponding visual regions. GMNER task exhibits two challenging properties: 1) The weak correlation between image-text pairs in social media results in a significant portion of named entities being ungroundable. 2) There exists a distinction between coars… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.09989v4-abstract-full').style.display = 'inline'; document.getElementById('2402.09989v4-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2402.09989v4-abstract-full" style="display: none;"> Grounded Multimodal Named Entity Recognition (GMNER) is a nascent multimodal task that aims to identify named entities, entity types and their corresponding visual regions. GMNER task exhibits two challenging properties: 1) The weak correlation between image-text pairs in social media results in a significant portion of named entities being ungroundable. 2) There exists a distinction between coarse-grained referring expressions commonly used in similar tasks (e.g., phrase localization, referring expression comprehension) and fine-grained named entities. In this paper, we propose RiVEG, a unified framework that reformulates GMNER into a joint MNER-VE-VG task by leveraging large language models (LLMs) as a connecting bridge. This reformulation brings two benefits: 1) It maintains the optimal MNER performance and eliminates the need for employing object detection methods to pre-extract regional features, thereby naturally addressing two major limitations of existing GMNER methods. 2) The introduction of entity expansion expression and Visual Entailment (VE) module unifies Visual Grounding (VG) and Entity Grounding (EG). It enables RiVEG to effortlessly inherit the Visual Entailment and Visual Grounding capabilities of any current or prospective multimodal pretraining models. Extensive experiments demonstrate that RiVEG outperforms state-of-the-art methods on the existing GMNER dataset and achieves absolute leads of 10.65%, 6.21%, and 8.83% in all three subtasks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.09989v4-abstract-full').style.display = 'none'; document.getElementById('2402.09989v4-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 15 February, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to Findings of ACL 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2402.02344">arXiv:2402.02344</a> <span> [<a href="https://arxiv.org/pdf/2402.02344">pdf</a>, <a href="https://arxiv.org/ps/2402.02344">ps</a>, <a href="https://arxiv.org/format/2402.02344">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1109/JIOT.2024.3370161">10.1109/JIOT.2024.3370161 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> On Secure mmWave RSMA Systems </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lei%2C+H">Hongjiang Lei</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+S">Sha Zhou</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+X">Xinhu Chen</a>, <a href="/search/cs?searchtype=author&query=Ansari%2C+I+S">Imran Shafique Ansari</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yun Li</a>, <a href="/search/cs?searchtype=author&query=Pan%2C+G">Gaofeng Pan</a>, <a href="/search/cs?searchtype=author&query=Alouini%2C+M">Mohamed-Slim Alouini</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2402.02344v2-abstract-short" style="display: inline;"> This work considers a multiple-input-single-output mmWave RSMA system wherein a base station serves two users in the presence of a passive eavesdropper. Different eavesdropping scenarios are considered corresponding to the overlapped resolvable paths between the main and the wiretap channels under the considered transmission schemes. The analytical expressions for the secrecy outage probability ar… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.02344v2-abstract-full').style.display = 'inline'; document.getElementById('2402.02344v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2402.02344v2-abstract-full" style="display: none;"> This work considers a multiple-input-single-output mmWave RSMA system wherein a base station serves two users in the presence of a passive eavesdropper. Different eavesdropping scenarios are considered corresponding to the overlapped resolvable paths between the main and the wiretap channels under the considered transmission schemes. The analytical expressions for the secrecy outage probability are derived respectively through the Gaussian Chebyshev quadrature method. Monte Carlo simulation results are presented to validate the correctness of the derived analytical expressions and demonstrate the effects of system parameters on the SOP of the considered mmWave RSMA systems. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.02344v2-abstract-full').style.display = 'none'; document.getElementById('2402.02344v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 February, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 3 February, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">12 pages,8 figures, accepted by IEEE Internet of Things Journal</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2402.01180">arXiv:2402.01180</a> <span> [<a href="https://arxiv.org/pdf/2402.01180">pdf</a>, <a href="https://arxiv.org/format/2402.01180">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Networking and Internet Architecture">cs.NI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Real-time Extended Reality Video Transmission Optimization Based on Frame-priority Scheduling </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Pan%2C+G">Guangjin Pan</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+S">Shugong Xu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+S">Shunqing Zhang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+X">Xiaojing Chen</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+Y">Yanzan Sun</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2402.01180v2-abstract-short" style="display: inline;"> Extended reality (XR) is one of the most important applications of 5G. For real-time XR video transmission in 5G networks, a low latency and high data rate are required. In this paper, we propose a resource allocation scheme based on frame-priority scheduling to meet these requirements. The optimization problem is modelled as a frame-priority-based radio resource scheduling problem to improve tran… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.01180v2-abstract-full').style.display = 'inline'; document.getElementById('2402.01180v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2402.01180v2-abstract-full" style="display: none;"> Extended reality (XR) is one of the most important applications of 5G. For real-time XR video transmission in 5G networks, a low latency and high data rate are required. In this paper, we propose a resource allocation scheme based on frame-priority scheduling to meet these requirements. The optimization problem is modelled as a frame-priority-based radio resource scheduling problem to improve transmission quality. We propose a scheduling framework based on multi-step Deep Q-network (MS-DQN) and design a neural network model based on convolutional neural network (CNN). Simulation results show that the scheduling framework based on frame-priority and MS-DQN can improve transmission quality by 49.9%-80.2%. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.01180v2-abstract-full').style.display = 'none'; document.getElementById('2402.01180v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 February, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 2 February, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">6 pages, 7 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2401.15628">arXiv:2401.15628</a> <span> [<a href="https://arxiv.org/pdf/2401.15628">pdf</a>, <a href="https://arxiv.org/format/2401.15628">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> </div> </div> <p class="title is-5 mathjax"> A Micro-Ellipsoid Model for Wet Porous Materials Rendering </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Pan%2C+G">Gaole Pan</a>, <a href="/search/cs?searchtype=author&query=Cui%2C+Y">Yuang Cui</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+J">Jian Yang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+B">Beibei Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2401.15628v3-abstract-short" style="display: inline;"> Wet porous materials, like wet ground, moist walls, or wet cloth, are common in the real world. These materials consist of transmittable particles surrounded by liquid, where the individual particle is invisible in the macroscopic view. While modeling wet porous materials is critical for various applications, a physically based model for wet porous materials is still absent. In this paper, we mode… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2401.15628v3-abstract-full').style.display = 'inline'; document.getElementById('2401.15628v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2401.15628v3-abstract-full" style="display: none;"> Wet porous materials, like wet ground, moist walls, or wet cloth, are common in the real world. These materials consist of transmittable particles surrounded by liquid, where the individual particle is invisible in the macroscopic view. While modeling wet porous materials is critical for various applications, a physically based model for wet porous materials is still absent. In this paper, we model these appearances in the media domain by extending the anisotropic radiative transfer equation to model porosity and saturation. Then, we introduce a novel particle model -- micro-ellipsoid -- by treating each particle as a transmittable ellipsoid, analogous to a micro-flake, to statistically characterize the overall optical behavior of the medium. This way, the foundational theory for media with porosity and saturation is established. Building upon this new medium, we further propose a practical bidirectional scattering distribution function (BSDF) model within the position-free framework--WetSpongeCake. As a result, our WetSpongeCake model is able to represent various appearances of wet porous materials using physical parameters (e.g., porosity and saturation), allowing both reflection and transmission. We validated our model through several examples: a piece of wet cloth, sand saturated with different liquids, or damp sculptures, demonstrating its ability to match real-world appearances closely. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2401.15628v3-abstract-full').style.display = 'none'; document.getElementById('2401.15628v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 28 January, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2401.14652">arXiv:2401.14652</a> <span> [<a href="https://arxiv.org/pdf/2401.14652">pdf</a>, <a href="https://arxiv.org/format/2401.14652">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Neural and Evolutionary Computing">cs.NE</span> </div> </div> <p class="title is-5 mathjax"> LitE-SNN: Designing Lightweight and Efficient Spiking Neural Network through Spatial-Temporal Compressive Network Search and Joint Optimization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+Q">Qianhui Liu</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+J">Jiaqi Yan</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+M">Malu Zhang</a>, <a href="/search/cs?searchtype=author&query=Pan%2C+G">Gang Pan</a>, <a href="/search/cs?searchtype=author&query=Li%2C+H">Haizhou Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2401.14652v2-abstract-short" style="display: inline;"> Spiking Neural Networks (SNNs) mimic the information-processing mechanisms of the human brain and are highly energy-efficient, making them well-suited for low-power edge devices. However, the pursuit of accuracy in current studies leads to large, long-timestep SNNs, conflicting with the resource constraints of these devices. In order to design lightweight and efficient SNNs, we propose a new appro… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2401.14652v2-abstract-full').style.display = 'inline'; document.getElementById('2401.14652v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2401.14652v2-abstract-full" style="display: none;"> Spiking Neural Networks (SNNs) mimic the information-processing mechanisms of the human brain and are highly energy-efficient, making them well-suited for low-power edge devices. However, the pursuit of accuracy in current studies leads to large, long-timestep SNNs, conflicting with the resource constraints of these devices. In order to design lightweight and efficient SNNs, we propose a new approach named LitE-SNN that incorporates both spatial and temporal compression into the automated network design process. Spatially, we present a novel Compressive Convolution block (CompConv) to expand the search space to support pruning and mixed-precision quantization. Temporally, we are the first to propose a compressive timestep search to identify the optimal number of timesteps under specific computation cost constraints. Finally, we formulate a joint optimization to simultaneously learn the architecture parameters and spatial-temporal compression strategies to achieve high performance while minimizing memory and computation costs. Experimental results on CIFAR-10, CIFAR-100, and Google Speech Command datasets demonstrate our proposed LitE-SNNs can achieve competitive or even higher accuracy with remarkably smaller model sizes and fewer computation costs. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2401.14652v2-abstract-full').style.display = 'none'; document.getElementById('2401.14652v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 26 January, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2401.05363">arXiv:2401.05363</a> <span> [<a href="https://arxiv.org/pdf/2401.05363">pdf</a>, <a href="https://arxiv.org/format/2401.05363">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Generalizable Sleep Staging via Multi-Level Domain Alignment </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+J">Jiquan Wang</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+S">Sha Zhao</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+H">Haiteng Jiang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+S">Shijian Li</a>, <a href="/search/cs?searchtype=author&query=Li%2C+T">Tao Li</a>, <a href="/search/cs?searchtype=author&query=Pan%2C+G">Gang Pan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2401.05363v4-abstract-short" style="display: inline;"> Automatic sleep staging is essential for sleep assessment and disorder diagnosis. Most existing methods depend on one specific dataset and are limited to be generalized to other unseen datasets, for which the training data and testing data are from the same dataset. In this paper, we introduce domain generalization into automatic sleep staging and propose the task of generalizable sleep staging wh… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2401.05363v4-abstract-full').style.display = 'inline'; document.getElementById('2401.05363v4-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2401.05363v4-abstract-full" style="display: none;"> Automatic sleep staging is essential for sleep assessment and disorder diagnosis. Most existing methods depend on one specific dataset and are limited to be generalized to other unseen datasets, for which the training data and testing data are from the same dataset. In this paper, we introduce domain generalization into automatic sleep staging and propose the task of generalizable sleep staging which aims to improve the model generalization ability to unseen datasets. Inspired by existing domain generalization methods, we adopt the feature alignment idea and propose a framework called SleepDG to solve it. Considering both of local salient features and sequential features are important for sleep staging, we propose a Multi-level Feature Alignment combining epoch-level and sequence-level feature alignment to learn domain-invariant feature representations. Specifically, we design an Epoch-level Feature Alignment to align the feature distribution of each single sleep epoch among different domains, and a Sequence-level Feature Alignment to minimize the discrepancy of sequential features among different domains. SleepDG is validated on five public datasets, achieving the state-of-the-art performance. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2401.05363v4-abstract-full').style.display = 'none'; document.getElementById('2401.05363v4-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 13 December, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by the Thirty-Eighth AAAI Conference on Artificial Intelligence (AAAI-24)</span> </p> </li> </ol> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Pan%2C+G&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Pan%2C+G&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Pan%2C+G&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Pan%2C+G&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> </ul> </nav> <div class="is-hidden-tablet">  <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>  </span> </div> </div> </main> <footer> <div class="columns is-desktop" role="navigation" aria-label="Secondary">  <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div>   <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div>  </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>

CINXE.COM

Search | arXiv e-print repository